From 92a191091472f3bc3c7d4cde9d2ba182d0f84f6a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 1 Jun 2024 15:02:55 +0200 Subject: [PATCH 01/33] [susy] in tmad/madX.sh, use channelId=1 by default but use a different channelId for susy_gg_t1t1 (fix issue #826) --- epochX/cudacpp/tmad/madX.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index eaeaf654fc..b9ef08593f 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -257,6 +257,7 @@ function getgridmax() # Create an input file that is appropriate for the specific process function getinputfile() { + channelId=1 # use channelId=1 by default nevt=$(getnevt) tmpdir=/tmp/$USER mkdir -p $tmpdir @@ -280,6 +281,7 @@ function getinputfile() tmp=$tmpdir/input_susyggtt elif [ "${susyggt1t1}" == "1" ]; then tmp=$tmpdir/input_susyggt1t1 + channelId=3 # channelId=1 does not exist in susyggt1t1 (issue #826) elif [ "${smeftggtttt}" == "1" ]; then tmp=$tmpdir/input_smeftggtttt else @@ -308,7 +310,7 @@ ${nevt} 1 1 ! Number of events and max and min iterations 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +${channelId} ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
EOF echo ${tmp} } From 9d634f61944eb7e1b07a1b81d1e412bb00b585c4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 1 Jun 2024 20:28:02 +0200 Subject: [PATCH 02/33] [susy] improve comments and variable names in tmad/madX.sh: iconfig not channelId (and note that iconfig=1 is ok) --- epochX/cudacpp/tmad/madX.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index b9ef08593f..d81435a27b 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -257,7 +257,7 @@ function getgridmax() # Create an input file that is appropriate for the specific process function getinputfile() { - channelId=1 # use channelId=1 by default + iconfig=1 # use iconfig=1 by default (NB: this does not mean channel_id=1 i.e. the first diagram, see #826) nevt=$(getnevt) tmpdir=/tmp/$USER mkdir -p $tmpdir @@ -281,7 +281,7 @@ function getinputfile() tmp=$tmpdir/input_susyggtt elif [ "${susyggt1t1}" == "1" ]; then tmp=$tmpdir/input_susyggt1t1 - channelId=3 # channelId=1 does not exist in susyggt1t1 (issue #826) + iconfig=2 # try to use a different iconfig in susyggt1t1 (issue #826) elif [ "${smeftggtttt}" == "1" ]; then tmp=$tmpdir/input_smeftggtttt else @@ -310,7 +310,7 @@ ${nevt} 1 1 ! Number of events and max and min iterations 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -${channelId} ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +${iconfig} ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
EOF echo ${tmp} } From 8e001fb98769384a182e7e73b20baa41e7135317 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 07:33:29 +0200 Subject: [PATCH 03/33] [susy] in tmad/madX.sh, keep iconfig=1 for the moment also for susy_gg_t1t1 (will give zero cross section #826) --- epochX/cudacpp/tmad/madX.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index d81435a27b..03ed6d08f0 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -281,7 +281,7 @@ function getinputfile() tmp=$tmpdir/input_susyggtt elif [ "${susyggt1t1}" == "1" ]; then tmp=$tmpdir/input_susyggt1t1 - iconfig=2 # try to use a different iconfig in susyggt1t1 (issue #826) + ###iconfig=2 # try to use a different iconfig in susyggt1t1 (issue #826) elif [ "${smeftggtttt}" == "1" ]; then tmp=$tmpdir/input_smeftggtttt else From 93a03f0ea05bba7e936df707cbcb1317fa81d996 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 09:05:04 +0200 Subject: [PATCH 04/33] [tmad] in tmad/madX.sh, add optional argument "-iconfig " to test a different iconfig In particular: the following triggers a SIGFPE reported in #855 (crash in rotxxx that can be fixed adding volatile?) 
./tmad/madX.sh -ggttgg -iconfig 104 -makeclean This also triggers a similar SIGFPE (initially reported in #826) ./tmad/madX.sh -susyggt1t1 -iconfig 2 -makeclean --- epochX/cudacpp/tmad/madX.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index 03ed6d08f0..b6d2113461 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -28,7 +28,7 @@ export CUDACPP_RUNTIME_VECSIZEUSED=${NLOOP} function usage() { - echo "Usage: $0 [-d] [-fltonly|-mixonly] [-makeonly|-makeclean|-makecleanonly] [-rmrdat] [+10x] [-checkonly] [-nocleanup]" > /dev/stderr + echo "Usage: $0 [-d] [-fltonly|-mixonly] [-makeonly|-makeclean|-makecleanonly] [-rmrdat] [+10x] [-checkonly] [-nocleanup][-iconfig ]" > /dev/stderr echo "(NB: OMP_NUM_THREADS is taken as-is from the caller's environment)" exit 1 } @@ -64,6 +64,8 @@ checkonly=0 nocleanup=0 +iconfig= + while [ "$1" != "" ]; do if [ "$1" == "-d" ]; then debug=1 @@ -131,6 +133,9 @@ while [ "$1" != "" ]; do elif [ "$1" == "-nocleanup" ]; then nocleanup=1 shift + elif [ "$1" == "-iconfig" ] && [ "$2" != "" ]; then + iconfig=$2 + shift; shift else usage fi @@ -257,7 +262,7 @@ function getgridmax() # Create an input file that is appropriate for the specific process function getinputfile() { - iconfig=1 # use iconfig=1 by default (NB: this does not mean channel_id=1 i.e. the first diagram, see #826) + iconfig_proc=1 # use iconfig=1 by default (NB: this does not mean channel_id=1 i.e. 
the first diagram, see #826) nevt=$(getnevt) tmpdir=/tmp/$USER mkdir -p $tmpdir @@ -281,7 +286,7 @@ function getinputfile() tmp=$tmpdir/input_susyggtt elif [ "${susyggt1t1}" == "1" ]; then tmp=$tmpdir/input_susyggt1t1 - ###iconfig=2 # try to use a different iconfig in susyggt1t1 (issue #826) + ###iconfig_proc=2 # try to use a different iconfig in susyggt1t1 (issue #826) elif [ "${smeftggtttt}" == "1" ]; then tmp=$tmpdir/input_smeftggtttt else @@ -303,6 +308,7 @@ function getinputfile() echo "Usage: getinputfile " exit 1 fi + if [ "${iconfig}" == "" ]; then iconfig=${iconfig_proc}; fi (( nevt = nevt*$xfac )) cat << EOF >> ${tmp} ${nevt} 1 1 ! Number of events and max and min iterations From 5b4237498d33016aa7660e59773d2973d62f6ce2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:19:24 +0200 Subject: [PATCH 05/33] [tmad] in gg_ttgg.mad, add '-g' and keep '-O3' in make_opts to debug SIGFPE #855, and add volatile in aloha_functions.f to try to fix it The SIGFPE crash #855 does seem to disappear in ./tmad/madX.sh -ggttgg -iconfig 104 -makeclean However, there is now a DIFFERENT issue, an lhe file mismatch between fortran and cpp (#856) This is probably due to the iconfig/channel mapping issue reported by Olivier in #852 --- epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f | 2 +- epochX/cudacpp/gg_ttgg.mad/Source/make_opts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index 657387a586..7394e761fe 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git 
a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index e4b87ee6ad..a32ac4b387 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -1,7 +1,8 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -GLOBAL_FLAG=-O3 -ffast-math -fbounds-check +###GLOBAL_FLAG=-O3 -ffast-math -fbounds-check +GLOBAL_FLAG=-O3 -g -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled From 07743dbe0c6f331cfb1052f738c75165adcb1155 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:28:38 +0200 Subject: [PATCH 06/33] [tmad] in susy_gg_t1t1.mad, add '-g' and keep '-O3' in make_opts to debug SIGFPE #855, and add volatile in aloha_functions.f to try to fix it The SIGFPE crash #855 does seem to disappear in ./tmad/madX.sh -susyggt1t1 -iconfig 2 -makeclean Then no cross section is printed also for this iconfig (same as #826 for iconfig 1), but this is a DIFFERENT issue --- .../cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f | 2 +- epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index e4b87ee6ad..a32ac4b387 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -1,7 +1,8 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -GLOBAL_FLAG=-O3 -ffast-math -fbounds-check +###GLOBAL_FLAG=-O3 -ffast-math -fbounds-check +GLOBAL_FLAG=-O3 -g -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled From 8dace2fe416b971b76573bf29d983a61f406418c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:38:23 +0200 Subject: [PATCH 07/33] [tmad] in gg_ttgg.mad and susy_gg_t1t1.mad make_opts, remove -g again: note that SIGFPE #855 is still fixed because volatile has been added --- epochX/cudacpp/gg_ttgg.mad/Source/make_opts | 3 +-- epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index a32ac4b387..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -1,8 +1,7 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -###GLOBAL_FLAG=-O3 -ffast-math -fbounds-check -GLOBAL_FLAG=-O3 -g -ffast-math -fbounds-check +GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index a32ac4b387..e4b87ee6ad 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -1,8 +1,7 @@ 
DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -###GLOBAL_FLAG=-O3 -ffast-math -fbounds-check -GLOBAL_FLAG=-O3 -g -ffast-math -fbounds-check +GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled From d98f939ffd304b491a0304de1b1522e1b9734950 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:41:53 +0200 Subject: [PATCH 08/33] [tmad] in gg_tt.mad, add volatile in aloha_functions.f to fix SIGFPE #855 and prepare codegen backport --- epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) From a6993cc782e0d7a06eac3a4ad3b83d297236558f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:46:37 +0200 Subject: [PATCH 09/33] [tmad] in CODEGEN, add volatile in aloha_functions.f to fix SIGFPE #855 in rotxxx The issue was observed and fixed in gg_ttgg (iconfig 104) and susy_gg_t1t1 (iconfig 2), the backport as usual is from gg_tt Note that aloha_functions.f is now added to the list of files to include when preparing patch.common ./CODEGEN/generateAndCompare.sh gg_tt --mad --nopatch git diff --no-ext-diff -R gg_tt.mad/Source/makefile gg_tt.mad/Source/dsample.f gg_tt.mad/Source/DHELAS/aloha_functions.f gg_tt.mad/Source/genps.inc gg_tt.mad/SubProcesses/makefile > CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common git diff --no-ext-diff -R gg_tt.mad/bin/internal/banner.py gg_tt.mad/bin/internal/gen_ximprove.py gg_tt.mad/bin/internal/madevent_interface.py >> CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common git diff --no-ext-diff -R gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f > CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 git checkout gg_tt.mad --- .../MG5aMC_patches/PROD/patch.common | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common index 3cfcc909d9..a144380912 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common @@ -1,3 +1,16 @@ +diff --git b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f +index 
657387a58..d0ec1dbde 100644 +--- b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f ++++ a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f +@@ -1201,7 +1201,7 @@ c real prot(0:3) : four-momentum p in the rotated frame + c + implicit none + double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 +- ++ volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + double precision rZero, rOne + parameter( rZero = 0.0d0, rOne = 1.0d0 ) + diff --git b/epochX/cudacpp/gg_tt.mad/Source/genps.inc a/epochX/cudacpp/gg_tt.mad/Source/genps.inc index a59181c70..af7e0efbc 100644 --- b/epochX/cudacpp/gg_tt.mad/Source/genps.inc From 90c8760b1fd2428aac0aad5dc90ff95483e2c778 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:48:54 +0200 Subject: [PATCH 10/33] [tmad] regenerate gg_tt.mad, check that all is ok --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 4da158143b..92993ac924 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058248043060302734  +DEBUG: model prefixing takes 0.0058269500732421875  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -203,7 +203,7 @@ ALOHA: aloha creates 2 routines in 0.154 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.139 s +ALOHA: aloha creates 4 routines in 0.140 s VVV1 FFV1 FFV1 @@ -221,8 +221,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -239,10 +241,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.797s -user 0m1.551s -sys 0m0.228s -Code generation completed in 1 seconds +real 0m2.091s +user 0m1.719s +sys 0m0.267s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * From 2883c5651990c7f07a22fa3868bacb1dd68cf72f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 2 Jun 2024 18:52:19 +0200 Subject: [PATCH 11/33] [tmad] in tmad/madX.sh, use iconfig=104 in ggttgg and iconfig=2 in susyggt1t1 to test #855 fix while still exposing #826 and #856 --- epochX/cudacpp/tmad/madX.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index b6d2113461..4f815e5b7b 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -274,6 +274,7 @@ function getinputfile() tmp=$tmpdir/input_ggttg elif [ "${ggttgg}" == "1" ]; then tmp=$tmpdir/input_ggttgg + iconfig_proc=104 # use iconfig=104 in ggttgg to check #855 SIGFPE fix (but issue #856 is pending: LHE color mismatch!) elif [ "${ggttggg}" == "1" ]; then tmp=$tmpdir/input_ggttggg elif [ "${gguu}" == "1" ]; then @@ -286,7 +287,7 @@ function getinputfile() tmp=$tmpdir/input_susyggtt elif [ "${susyggt1t1}" == "1" ]; then tmp=$tmpdir/input_susyggt1t1 - ###iconfig_proc=2 # try to use a different iconfig in susyggt1t1 (issue #826) + iconfig_proc=2 # use iconfig=2 in susyggt1t1 to check #855 SIGFPE fix (but issue #826 is pending: no cross section!) 
elif [ "${smeftggtttt}" == "1" ]; then tmp=$tmpdir/input_smeftggtttt else From cf2acef66551a2d28817504b914cabd289f0c079 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 1 Jun 2024 12:59:46 +0200 Subject: [PATCH 12/33] [susy2/ps2pdf] regenerate susy_gg_t1t1.mad after installing okular on my O/S - no change, no new files --- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 78d37d6c49..385b39add0 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.130 s +1 processes with 6 diagrams generated in 0.129 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -593,18 +593,18 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s -Wrote files for 16 helas calls in 0.117 s +Wrote files for 16 helas calls in 0.116 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.196 s +ALOHA: aloha creates 3 routines in 0.194 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.201 s +ALOHA: aloha creates 6 routines in 0.191 s VVV1 VSS1 VSS1 @@ -645,9 +645,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m3.081s -user 0m2.681s -sys 0m0.245s +real 0m2.892s +user 0m2.616s +sys 0m0.273s Code generation completed in 3 seconds ************************************************************ * * From 51e665bde02704d308e37ee8ab58144f3675abe6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 1 Jun 2024 13:01:12 +0200 Subject: [PATCH 13/33] [susy2/ps2pdf] regenerate susy_gg_t1t1.mad after installing ghostscript on my O/S - three new jpg/html files Changes to be committed: modified: susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt new file: susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg new file: susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html new file: susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg --- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 13 +++++++------ .../SubProcesses/P1_gg_t1t1x/card.jpg | Bin 0 -> 6238 bytes .../SubProcesses/P1_gg_t1t1x/diagrams.html | 10 ++++++++++ .../SubProcesses/P1_gg_t1t1x/matrix11.jpg | Bin 0 -> 43691 bytes 4 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 385b39add0..ea2011c29e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.129 s +1 processes with 6 diagrams generated in 0.131 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -598,7 +598,7 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.194 s +ALOHA: aloha creates 3 routines in 0.192 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -623,6 +623,7 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc @@ -645,9 +646,9 @@ 
Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.892s -user 0m2.616s -sys 0m0.273s +real 0m3.107s +user 0m2.794s +sys 0m0.312s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1d72b64b3c4f4560608bc8fd8f140733a9e2e29d GIT binary patch literal 6238 zcmbVQ2UL^Wvfh6hNl1V|=q-={p$egerZjWU{~6d+nK(&56xvz-4a2FaZz* z0W;PIY|dgy3?rj`*0xqAj2+uq698bj083!3XHalB)7I3GwA;y<^uzX3+&#lW4Xmwq z{CxlSadix^A_4GF@!#kEPYc$|JIs^i^_8VZJVTk`06KG^(`V~FqWFWax6zic7J0MI`H0MGbk^EeMcO9B7_lfP_~O8{`j1JFF|8NodG zXCzj-06+pP-~>b<2*dyxP=O+#0ZpI-^ucyu4y=G3Z~}XQC-4J7fC(bO5fBR!K?=wK z*&q)Tf)Y>;YCt_`0qvjz+yR5&J{Sd$!8Di$ufZ}{1z%VPk`2N`JdhwH4oN}s5DlV3 zTcPdH4#*aAg4`fqC>V->jzA}%6etVIgNmSXs19m_I-q{&0rWdG11&=Dq0cZl3NRgSoRt0N@b-)H;qp)e%YuG0^fV0E-;1X~} zI2~>Tw}Lyvec(*^F?cdO2VM-XfnSIB!XLt?;cws@2o!<`K|&}ZbP#3;dxR(A0OA-T z6_JN1Lo_405f2eFh-JigBpxY(lt*eK8OYs8Ut}aQ5t)N5MK&ROkfX?XPvO@&csP_e7#v<4$2syjnmI-|-f?0$r8o^Z_i`TM z%;9X{9Oit(h2fIsGU9UQI>wdH)y6f>^@+ekP$5_mf(WUE3PL~OB{v(lH1~FHFYb8m z%iLYub3AY!GLI3DC(j9<5}qEOc_NA^Lu3&56H|!Q#3AA`FE_6muLExsZ$9r0-f2EK zpEMtXFOcsvUp?O#-)DYNegl4Q{v`ey{t^B)0bv1s0dIk1fm(r4fzN^@!R>+pf|-IX zf=`7ILUKYjLXkoRLOnungo(mB!k)s(!u7(FA}|p-5nGYNB9}$(ihL3k6EzhL6+JK7 zCHh8;PfTBIzt~x^n_`RNJmOo${l(9U-x7aC;w2f70!g`~Zql-Zh=iF$ghY|Vki-{B z8A*G|ILTVcr(`UdPWB<6C3ljSDdH4M$`MK>Wl{ zjDQS7=8#OK%o8e}x{Z2(T1pm9#pJUoKfOa+MyJu)S~om3uTM*mdq`^TfQr+D+el$q0C)`qr*cDQ!E_L7dGPJm8@&P!bxU2okI-8nr;J$Jn# zz3Hu_t!`T{ZJpjGvCVy3@wQogvc8voss4h2tigVRDuZ{1%7#qCCc`x&ZKI<`w~gV( zrpC#}L)*Ex+i%a?K4l_l;%icAvTUkudf4=~8Pd$cEYob9AjT!m*`RE!ZSrki*s9nb zvF+W(vuoe3ie0OAhIXgz9@|sxBkVgJI2~LZ${p4mjUBTbr+07JeRTJrlaQ0YQ>!zZ zvx9S~^QS%A_hj#xcTsmqbQyD%agB2A-z&5?aPN(MocrAOHMk+&9NjA1zPekv7rTG* 
zV0c{cSoSpXJm>k^OW*6P*P^$ccb4~pkDgDK&r4rD-!s08e%t(V{NDH*`RDns>^IwA zxPL9cD&T6sW}stWZ4f%hEvPM+J2)V?J47rbD&#?^d}w0m^a1Sy*$0-H7R=HxD9kmi zHJr#QigzQZ5eX492e%%~Ke!(05ZMq#hzgFn8!Z=|6uofB3g(d*(!fqp?vp=e=B;pa>KmnMo#i<*n2i*qicE+4%7qQs%3_X_<=d8tro zO6j+&!B?ltY|FaKHOtE@L@G{K!YU&w7pq*V?pGUEx7H}u6xH(8rd$KpBCfrvbE_Mx zx2W%Ipf}VsN;O_+;%Q25hMJ?Bms|W=rdyp_AGDdbbzaxG-t?Q&Z&%wT+VgMl-Z*^| zdo%v#=B-1wKHUzz{iegWW3F>w=i{#3U5~nLx`%o!diw5|-0A8y=)Kja*Vo>!-QPAq zA7~lW7;L_)ez)nK`n{$hjiKgY&EeMjTKBI%(0y=oWZOu`L*s{c9x)ybj_w?NFlIkC zG43)xJ>fO+^7p{sS0)cmet8`81oYr_vGbGlr>WJT)h}zw>w@cL8}yCd&yJrLzC?ay`%rY0_{2m+E@Ld5{Z zW8=;yd4NcZ%`8Hb4eL61Z67RCR6Uq+7%U&ZfS{0+v=bXzupI3ajIQJfyE1kK)?}fKX^f~$RETB2&9Axirc`3&HVt6BrO(AG|Vij>%fpzZI^jH zm=Cag6tx+tl^@jpVD|Tj#r?mS{gc=)UXuU^hgh2jCxC5W?K8DF2K%3?{1(g6$xUGQ zsA}(!ndOA<4Up9e!~7O;!8`;hoiC< zYY-~+B>5Ye7^$v0idT+8Pi_MjM^-ZR5{#uB+j?&DwYS}0s{TsmP0udhPdoWrd(yp< z>a@GUchfM6?KMm-+c{FR$==u%X<|g?r#9+J%^wHya%jbvLuV=D^xRX<9$>kWY5S#OplQn^#L+dc%A} z^JlZQmA|{4$S`=$Wza!{nE#;igU&CN(^^Ozo7$6-n=;nhFX100*`lZS`j7aaU>#Xo=O;(V7_QRJDXs2NAB4&?AJn=kt+&$YC{ zZOpNptUq-6OWR&)T)L>4mqaY?(sRN{`z`wiO|>UmT7<^Vmz(5;j|*S<5UWu~-rAj1 z>_2K1PX@qAt(6Zk_ASVQSPO+xMDb6W8kCI;$-ksuU0aSa-uWzsheJQBKJ5vi#uIZIb~ zJD1&c=i&K_v-U|Tu)YofowQqF#$a{|jqN*PS$4%+?oVetklYy^%AtHV@o3P{-yxafUgtQSu zuZ!dqd{yGgIo!Pu8f?NB`>bI#vqY%Jqn1G9%~bQdglZ@H*fmRo&kk8Z@yVN zmzqsct|kT9LQ&X9v3ixGIrcg2LZm}|bjD46Df0{iiR`n}l`yoNJ}C~3usf8K?OFM_ zihuRN(5YFo!O#bE1e=M-<0{(7C4g#g#&4y*VAWi{aQRtE;}T&b!*I&=qZW5pB6Xs7n3$lSLle`$-Re^C zN|-{16M>E0QUo=yfPS!F{YgLBCQvDdcBx+!(j zEylQ<+G~~BOt`A(J5BdvF8E9D6d)l+WW{w|lO?OhwAmaj<=A(zu6Iuh7f~;iX+4%U z!cjqUJeN^G2=9${n=XVIH+{~!A{hlhC=l@|`8(HY*+PNvp6FmDsdNEJ!}km_($1R3 zkcj~cYS6+l%6jNhS?|%JLOHclM))dPQ+v)xot$sFcVO+iz7!v9ys?fK!IW^i>o~Xb z_g8CO<2)u2qUs_p`^QxjY>2y4Fa^BKJ|moVo&F8M%LjK?6g12|_3hF5xCxMtk7OBp zr8|1j_R8q?HhcSBrlY-sS`{VZ&X|;aGfmto?i3YWloe3E^rGRM2Db$zrh_YGVD|*S z#4;{Ely^sMowbNH8^gP_%j>=6bwM>;n%^cKamvf+Yv~-Q$(PdEblA-l6+MUg+)_Io?QbsWO$$j$%fd&@6j ze%`@<9slQ&=)HcjM`&BZo1QzBM-g}#F2nc5y7w}ae>)}W#AIOG2DxYf_g*xtY&oqy 
z=%=1SMOJZNYio$&efx1?-&D7Ys?V&61ldEwW1Fhq@7m`23IPS`W4gN{#Ik29gB%7% z7+%c<^Q5SEkr)S(naZs*>^?Rxn#Xn#M%cOR9a0GYmYrr;{832eWaCrVGwSg3F?GUMABTNOd)iRNiL72CdbXq>K>%Vs%{5L1o)Z_BF`A&Wqhra_6a2WDhroGVrG|b zR9Qquf4dwOk`U)wD`ZkEl`5J>J|PX~R9!*_o!Ad9!8yD3qQ|e^D%b?wE%lhEJvW{l z^SnNFC#j8+y3yMw85wC?WcL1kao0KgB9{Tay4_)Q#S}*%na!mcR!{{1bQ>H<&iYmN zf9C(^SbNl)ugF)bYFOK3y+1B8;SO3?o2%=p=kU(#y$d#$(x`lg);MANsii}2u61!u zA^ds@bxygd-p}3^Dc4lA9T4q8JFiI?)3{s<7cwu*=hn?9(fg;v;lK^Bir3MWQ*UQR tg}X;g*24B~WD59vkR)uV;7s3_+i{3vO-!{EcCNDe&A;>|e;055{$CII!xaDk literal 0 HcmV?d00001 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html new file mode 100644 index 0000000000..58a29e1a9b --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html @@ -0,0 +1,10 @@ + + +Feynman Diagrams + + +

Postscript Diagrams for g g > t1 t1~ WEIGHTED<=2 @1
+Page 1 of 2
+ + + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg new file mode 100644 index 0000000000000000000000000000000000000000..672916a2d92f7765449bcddeb317d31c8eebf9a4 GIT binary patch literal 43691 zcmeFYbyQr-wm!O=ZfM-yrIFw+!IR)2NYJ3c3GN;&5L^QUm*5)Q3GNQT-61%^0wJ&W zJ@?4(?z7L?=Z*Wv8*ltpk78BNRaCE8tE%QV=ls^MxnFAlw!E~QGynpD09p74__cvb zE%)@PfwHQSwA^zk_yqt!#RlL90{pLyt+S)5%oA!Y?N`*l$^Lr0Hg>X?R91fe*YCeS zU(EpUo&aEq?f2LItpwG?)X5m$>KtA|;4;KI+0;m8MfDe!WlmH#T1h4_zfB+y2NB~lRJfH-q0os5*U<_CQ zHh?4G26zMhKrj#v!~h9E3Xlop0!2VMPzBTjEkFm*3k(6{z*k@qSOd0!Z@?*V1>C_W zBou@J!UK_js6g}}77#Z`5F`eY0zC(*g0w+~Aajr{$OYsL3Iv6N;y@{&Y)}!X64VIl z01bd9K=Yt=&_3uK^aBBmfR2EVK!L!3z=0r$Ac>%WpoXA_V2KQ$dkys$ag5% zDD)_gQIt?jP&`ngQF2k5P{vTUQSMN2P#IB0QPoi2p!%bJL@h_{L0v+IZP8wU(8g@I?PGTV=NRbdMrsSeJoF`M67D8F|0#u6l@0Ur`WHt-(sg? zH)79XU*h25aN#K7*x*Fq6yprw?BPOj8E~a=O>p1ge!}g--NXaq(cwMCGsX+V%f;)# z+r~%2XT+Doe}f;6Uxq)5e@cK$z)PS(;7*WA&`PjI2qt78lq0kzj3%rioFV){L_s7; zWJ(lDR8BNWbWKb~EJ18a97bG0{FV3z2{nlni4{pKNj=FjDTI`nRE5-yG=sE<^pFgn zOqk4=ER3v*Y=InsoS9se+>87Z`7rq<1tovzRCrWk zRF+g9sJ>7gP!mu~Qd?7hr0%9Zr6H%0rE#IjrWvKVqh+L3rwyR3pk1azr4ytxqx(SD zNp}jPf+@nhVI{BydK7v=dJFm_`ab$=21W)=hW8Bh410{kjB<>gj3taqOc+e!Ob$#r zOw*5$9tl3OdX)ZXoEd?ckJ*Abm3fQ>frX#Nk|mvGf)$Zfh}D)ghjo?>olTO>m92zr zot=>VIeP$mJ^L{SjN=tYEJq*511BG+HD@m8A{Q=~JXZi$6W2L6Gq*8!D)%%G29Gq4 zFHZx{87~X3DQ^bv93LK^BHw$yPQIV~0{o8r<^1~sFabk>G=W(`d_iTw2*G|KM4=}_ zzCtZRw~zTBJ3p>^d?L&yY$aSQyeGmaVk(j=vMEX{`dTzwbWMy}%up;#Y)za-{Iz(F z_=W_Xgo#AH#IEEc$v2XvlE+UtpEy3LeRBQu@l&6t?a#o^WS)gR8;!A{uHM78~BbR(hTD`qD_wD9z~9_^EM{@i!9*lMg2Q zrlO{Crn_dMW^rbF=A!2D=KB`n7Ks*zmQO8HEYIG^zR7%ZZKY&YVD-yd)4IY2*~ZYO z(H7U%%C_5%($3v(!k)!G*nZ97u|tBxsiT5pz7xnv&#A!~A8r(fU6@>gT-II1TvJ_d z+%(*(-ErJ)-G@DxJwiQpJ*7SKydYl2UY*`F-u~X}K9WA!Z$WR3-gf%Z`M&er_LK1| z^hfo#@*fG{42TQ(9;g}E^p4`4-@DBqnV^zj%wWgh+4sWlGeaODmLa2|JfX><4`C)@ zgW;UviQ)GV#u0;&T#+9me?^%^jYjiFXT%`I*v8DnipLhl;lz2yZN@9c*L|S=5cc7F zf-nErd_9-q)%o@WRz!8WQJ${ 
z$g<3u%a+Nm%VEe#{Dkz$_0vwSMs8o8U|vx^X?}SA&jQ7lPLzD~@j&&$2En_by zu8^HYBoorCRf z_TTOfgO6~I(vF#qYfdCihEKIlx6d5Uew{~LkX#ge=l$M!seHL|Wpj0R9d<)uFc<*>Mnr_b@eIfZejR||AmY++Nh0B?yoS;`;B))OW+T%*sq7$7oj8T@ z7&!)@pb`=hlaSIgFfu)2=H=rT5EOd+^qG{jjI5meOEq;3O*n8eHZe6bx3GNUOWG#H5eODLJ2V^YRM{i;AnNYijH28ycHBySjUN`}zk4C#R;r z&dkouFRX8DZf)=E?(H9(onL&vyt=-*{qdV#AOQRqwf?5r|IiBut``CX0){|;(+h;) z_M7535JVa-BwR@q=xYZ&T5f-2{3o&5l^rN_JgTPzMvfDxgfQN9`m^6u`!CJ@oMHk0 zuQdCcVt?0b0YC?X;13Us1Be6HKbUg-Q2%fFzhm(K*Ez6r|9XR@MHDk}LU+He$iwSO z+t!8X5kB+lgzH^87&kk9Z1eFp+=eIUZ}hGld&w0-o+^zm_! zptR%YE>qzXCiC(V5PAJeAr#dbT6k&1Yrs|kz+blIx&dYJnqqxe}!IIzuy zk<{L)+hcZyF=ZkAtyK8<*cE6GxVSJTp&H`iSHfZl)TI@;U_chG9DtxGlYgLm;CckJ zf~iXH2pw$$au7cRm3Ejp1Y43ifJ9Ce1!$DQR|fqr7N~fI#X@gU;rxhsu~vtr&D~g) zo|eeL1vnu#Y&dz*PhGa%E;SKZGw{pJNv9g7c%P*r{ISsatgmV zG2P6V*k=31Zwc$ZxNRumC0?yBy9n#ue#~SS{mgMm+2Pgt^qNla`4KVez;t|jpSPda zJG?FzJA$5;$`ME*r4XIr;DNLx#9oqoC$}O-gk!S;+nezWGcUK-u%$NaD7(4F%$V%4 z*o^xaqE--{s}S*E8oH18x;UwYKHpTzg;eu}cUg|Rp1d3yXFqzu83~E1WQQOxXk0}K zANVsxL%()Ik*KQ(?Hg%cUPyYU)b1dD<5Efcfdp%#C_Z@_rWIB91D_4)I`2<1>E<(} z($T`Jndi+m;RK6U?{^2#OXU-Zg_R_DRQ4#Li8JRmGCidA_pqke1Xx|l!2}NLm47sY z*fVQc#AJu`l3r<9r1wF!RJaE!`a>IRNElSO{#K~^Y+WA~xU~uwud4&Fys^0&n^6C8 z?l5JcdX^H+zS8R^H9d3{h+~*5*T)ik=G$6LjTf=WNYX1wPQ`QP)=hI-B2zP|J9qG^ zc`zQ$-7sbO?eK8VmYI3Hcf{bW(Jkv;h@fz*QOlQ2XczBU;>U%$R6CJ2nz~S$`bb7K zxtGI`;MKlC(kGmn6+&yuP>(G4XxXzDiA{;&J`~H@j;YfG%w2`1hNv0uA(+Ub$Yt`k zJRSN1(+{{e0X96!7mW1+=qZWmWPP;V61s9^C9ylvAGuG(lu^c(eq_pjwRRa{Uwt{n zZroVipm-*qPVCDm*-w3rOuZKNAI)tj(Wa=84Ug(nOS*<2Oefq`hu@e?i6jXRA3{~r z5Y0@ZFa2}QoC~z@C)U0FTj3jn{F^RmD;7G-Ph)HcE4oWvCZ$uKwV`7Hc~K^eu#AV= zi~N#L?gi_8Aube3@`{?zD{H4oM;ES?jqjeEm`U~{sME5hArye=VlykhQFUh!PPFJ1 z$h8$Di!Z*mgHN~zTL{6B_@OQG;4y$oYC@$$L_p(AS7(TPT;6{( zjvu`|*92Pzf;0L?9$?A|`+X_)j4CnC&1+s=>H``YqGl+TO%fL0bS8K_w=z1^Wacm z3wBq4N!#f|(PqVwqCk&prr~(@G@TW(_3;-EFMN-Iw1{Z70S#3C&vesBqCLRt8>@~V z(7>bJK*=fNYPltmTw!4$FqpMF4bz^-KmNeVF^mbw`F#QRJ+OGQJxtcx=yWXbuVOFj 
z{Q~la5NG4th(c$c2is?IF_VGW;?uTfGrTEoJ_!V2^d^<^Ej>`F;Ub0Mam*#>s+8EFy^t%e=hI09F5&-3?>>UWQ? z@fSeop0V1M;@Df=&gXSBZmChV^)r7fI@bR2BcrTJYK$(Ji87)u@lF2b)9O2hP~)Fv z8)T6sHm}JLl)0E0K9~@^!wFbIUo&-zaTjj)Gzj=oV^Z1B*4#90io2|5ptvJPL((e? zxR?p`${EMGcdUydt}Snf+9_1looRf+jeMe@VWv#6(wjM2VGpH3F^?!^Y+^3u7c}CD^&PeRL!*{iAcgXOhp| zbagfbXRnhOw@g#l8znkLM#O($4W)+J>qNF630ez4W_+U_i1yn!CsGFc7c4G@JnNGL zIOkb-4+$kN){2aiQ3e>llzHzL@2xz#aoC`cc{v%cKE-O|f>d4C`hoaiym>yx5lD~? zX9SAc`?pNM2jb9!sW%v854@(U;)b-$ZC zig)x_udPqq*w9)JfN{X$IMt=VusC-B(@9sXqU5*5IqS`{{-t*B#NaYQTlFu%Njcb*oy7l8U+Wc zTC5N@=nNe_?RHyTm?aW$SYBKh&R&}e16LYCe72WlJ|ZNx!@y|=mXwrJ&N9s@qYHe^ ztuArryGA8T{X}G|7GTCW3BdH@CaY7C1R}o59~T)B>}8!HI|qCxNoM>wA$;zp*Z^b^ zY#HJrrYx*vF@bwA157=|4K!rlIp^kYAv96QbQT%Q-7iSgiME&)1@LAC0ZavFU&z}> ziv#ACYe%*Q%p2m%``4;Mw8~1@m5jk8ombR|YyQnk9Hp$6!a}SkN7T)=@nq{6T(tbi zbjSFUMz0!scN9SYmg|-Uu1Uo9!lA*!f9O~mf=s7F~M2FScgjvg9#NN75a;GTo;`Ufj zfeut1@SN~g3rW+EK)!BAIoDP=rR0WP_=GUwETfEXNh2kw2~838^+&cRvLvQkSX}e5 zCYlS#7=E$~G9G_CMgC~|7~jNqE(nCz>>ctAiPtXlF(tnoaX($d##9cRh~Cw~SBCyC z8fGe~*`Ry*8wU2W<4YyFQ79sG<@@j-TkB| zudUX4YrqarNN(fe5FkmR z=30>`B9(3R1^VEUoU$Ja`CRrhy{ZP-*G+_q=b^lu#m}z&ALzPCpG(G0>MA74}Ss8 zQRZsAqNi+qmiz;F|oSdyV`%mzxa&Ljq zmh?$`p9_m#WGFw3shqm>o?hi`v zw+=~#;&QAoKgn#-Jop8my4TB|Wkm2Pl3K$2#FJC5B?LG|q$X6?Ni8AWEJCfy)j!+6z_8E^y?MDI6Hvouu9j$r zEAunM=IHSI-a7%@8s#TbHZLEdK7ve?LEHjR_9ad2UjSPD`tDD3@mqR_UqJ63c$}$1 z{1*_kczW>=Om+WT2y4^%_cwZR+al?(>5H#xK}ppdz$PHwjAONPJT1Nu6HNI`ejZUy z4rc%@3N?{rMCdK`+`VLLX6IXs>Erc`je(tb?O=2?2?}WhS(Q*j`D1;xs@M$q>lFS- z^K8ojbF+*{-eG960|Q`bYKU}822@kZWufqvG50;4Jo-Muy#UXeJI~|GO~_op(&Ch0 zaWCXbKKu)S_ea+IALRt(f3$j9ur}QKlx}{Gi_rc8zK<&E!G4@z`-%C>&73G&*9cE#yZnG%TQ~9w2{)`%-eo+!n}iJ;eINc66uw8^ zBTr-rx?k7JX9o}pt_`Gd?P0;zDC0XaQEMwmtpp4f163;(QfYjUieL;DJ8l+pxZUeR zbd|kplHbO|ei(GM5l(rFnwnI;S`f=d{Gj1@fM#99`i+$Neq^ApzXjL9e_w)JFVTOa z0?FM=@vHN|Zc|jBe1LETnhd_AX@c@?5WTYfVtTG1`sh$Zj2h-<*9xrqj3POac( zdI16>5b@|Z6Zmrue7az{cY5%3tt(-7Y`Z5iwOsSNPNvFz5~A5~io?e!?zxJ*RY^Jg zr*8E>bG82_tN0+s1k_IhWhR-6ATTwh6Dm3tUoTkrDxy-lXsP)_ 
z>L|BT(njR1Rc<(wUXM0^bi;jslGu(P*FEE@KsRYyw67a`9%6dga={O@9v*^bvQKgR zE{mD;exqcVrT8Bu?NODbSSU9g{b&ELP|!rcxdI2KB)QYc z2W9B!`zxO_7Ry31)rp9Z!y-eIhP%hdA#^XOPEZt>EA^V?RPO0abqwZz0Xjspi1$w+ zv3-`TuCrm2eGXilrIzS^$7(_Ln%e_R^55yq$Ken-_C{fy;5w*^EX+fc4K>$duTmYfgaHnxj2vU;TT!HlJ3(FF-nDW@EvCVPnCkdBdl84L4%YF)H%B@@`I$ z+7Fbi^}u*0mb6ZRX=9+7IanVZq`tlFG=c4A9C(W>3&alGOW#!Ix>K%f3@&fBoB>kj zhwb#nODGt0a0Hv)xa8OhS`u@|zNy&w1%%b2<#H__SlaEDHO5GnV7nNwi}nFnJc2ND zbU9WJVEQKLOhNDLuzsdaLA3ro?IKO>Gf094$RzoF3HN>D0%pemS+w2h;##h%hi7>k zs|y?6%gLNLrmB5jNAUzkYnxsYHx`q?aU|O2vG)dOHLFn3y1H;Tq%yH&iq`Gw=M}iJ z8ay<*2u`g$*CAV*5ylft2(3YpZOH{XTH1Zv3k%&bovEP%R^B&>5w_ z@kS=ribYdgzYTi~%dJiAhX4{SD&=-cN5L(e9;cJC1AUB!7D}4)v3I4`S$ASp=e+eu z&vQ-9RTACIgzK68pt^Am3u;asaBrvX8b^i~H(QP__9;sdF?C=E1P&XiGssHZ%h|T0 zEa)Z92FQ>!Rk@Mr0+4t4=p8(xQcG@BH`ULOLS{YlK30VZdh@=6SxhA8jl;7P(qz|f zzHnU%OYHF~VedEnY`bImIJ_5PYeO}pw)V^>{>Dq@YlU&fyo<2xF#%nGTml;H z=O=p&OA3g}_Ie?xD1up-0sj{2vQTz)CwfMev1j5tm0$@EBurap4#ddtFX2k}85Y>< z?y@vENstk2`jR{_Ebl!M2>y=pDO{O7Mc zgl3yw9$AI-A0dw1Ak{{#{*EDBP~~a;y53lri!5J7-F~%2)7ygW7p3JpTQaq$g@Yg& zM46EQkdQv-+k*elJA)E!h+@>(m|gTWDc0o`Sz*tk8RyG5<_L$>BS@e}s6~Hy*DL>> zKI%ALddB*4S%`ymb;2wt197(wl4V(@)eXbNYm{5ns0Kfd|p%H zAtp`DOfzSjV%DZ1%3;sT+qcr^h=~_Kjf@ezdu>x6IlfF>YSkpd7xe4o-1F0=@6p#K z7ir*o;i~3Zq_X_lyE*o33p<>zluOUsjH+o|5S?P=OE@VyX@UgsYVmeT=8ljp)NJ4P2IJ`C#joUSorf%ZIYuzEma$+YPY|1y)Q8GSEtMv&B{ggsQ&Rm`OP0w8JsPLQ;A0=0Y#9sdPXIcJJyTfVY{rC*Cg*>l?i;sae?fe-_;M zn+&w$#5UGhGU05yY4E;39OC*2uceN=f0Nr5d3e9sfj-T*lp5g)i|4aHN;-$G1s|;F zc!hjQK~c_)?axk_j=+{&3lQwFKV4B=Gr;}Gw``|6T_z$|^m*#~RXkxP9j*o*!-oB0 zf*Q!bG74C-0Jr>p?fwBC{*>WvW3%1RkO+@rp$)(y;8CmrSOx%2oAtkcakKvmplX7? 
zGk8cID9O##+rWm7qwsvI@tMuSyx0)vNvCVSx+c^(q1X*U;7Bq9u~4qbS`x#@>1cE_ zVGnctqWyi3FVVevp&m`t$x8)aQJdiaKCQswTZdyoz$3YXPS8lsc(gB2jKE5tcgFTh zJyM=g691DQ7^ z)s)GEV(Y=e{Fn>#t>*f01d!EaaP+{_&3*@ zH-TqJ&PQUxm1ryo1X}$3xw)0Xqesn^ad6g$ThUF|#%`#I54S>dc)#j; zPNlYSD4hQkv_R$f9ww_&l@&uo@89{8==2xRPU`0Gy#Fvlx%lm8oW4y$2A?6;0O7HB zE$f||X^1$NXw?J3y+F;WS*W>}hcUT6ED?@K;iM-r?dx~Ft|owA0)^UI?7;UwNDco( zIET6QXJlJJf$p5X%+9OjXQ#udAxja+uVefB8S}K@8>~3Wn9sVxoCuMRsq2DX&r9j%U$wH z1?xOA>+JVCHDh0f5?Z+;z=SRWWLpBoKSqJw*qb z4L!H@+2mgUO6%c|m1L2kD<6%a7bsWix9Sg*B|%b4L-n>h;sRB(x$yM__wjT(QUn5N zTi?Sud2GjdxHVYz=P(l{gAZJSqufd_Ga68l3?eTT<)@S<%E~g17bX=JmAEa8j2MjF zP(7X>AU~TkVQV(qFhtwA@MgN)PXAOob^{A4vu>GkJgXQ=D;re0 zt*s$q14#=e1_whc4?qCbV{g?Pe~xJO4;^+sRymRu)!JQ7+)sA~b`CA;oOXQ4U#o|b zX;%R*6*@N_r(Q~@-|F%zEElX7y4@;7KP`;#^gv8C-vq@MX)H+^cH29}_8N4Z_sE%L;ArHnnQ)Hlmy}0$~&Ewjy#iR`cf)Apf1-fl_T!K-IopDpGFD zFtOX%upi}ei7b-h)^iCD;|EX#&@pYJmdHikT}5^No+$ChX!Q>*+l;puY{Gm3vnC}k zYqG1#r*a#GP<_~Z#~R(17@ECF(w~JP&zweDm+Im`(At)5$$P#SUbq=$YUVE29s@Wk zX9zeK&gxC9szkrg)99(lTisqnd&e!-IXI`gwe~4+PrVfbuu##qze$p8{EC@`RG&Xo z2sPmkb&g~*9bNxSzGb*&96?%%RATH>tZx~k+eg10CwPhujaGO{Mx|YzhXSpV$mCWo;RFecsbS?#MpzA7sfam?&{P8$&nX%y zp6keqR4g~P;+Jwxx&iG-dhKwlH@0+>yP)tI{Rd$&J9|!wn?E550ZR||1KFjQBNP(<8Yh$N~Vz|yhyv)EhSE&r!-J^ZtY*clNk!+7YIe{d zu6-9ercB{w&V@l`ZGCMDf86zv+)jI$>A>-WesTX)C~EhmM5eiaM;8)(eM~{HscNB{?d!@b zKU)>uxwCf$hF>pPsR6iWS?^-`NTv`?wtSbE!{a6D4KM5O(H>ix(c`wgKr$l$F^VKnwy^A_Dwff`N zObS1e-tWu*hT8GBY4ext0LhP;?1*PdMAd_cO`uBextn?vr;v^LHG@YxwK2_%avV0J+c)cFLte+kMJz_w8%O;?GZmi*h`Q@~u8gQXDf(YnKWP@GY8; zHbf-z_l}+4=jq`WcCN*Ep)!Hv>rNxTyP-177!$_mFR#8rvl5N(VcoISUS1|=YVgJg zggp{Lz!1NYliT}FbZ`nPth^7&UVSNml;-5GM$kVa_)aCU`rF=LVkrql;}ni5p0A)A1x$e zj25*K2J}tTqz(zgxkzfQ0p}0jhN6c9=GsG1!Bw1jtDi*58qzX(C;+h0zp;Jg>~;u> z>?Qb{>XFuK4PGHMcaOM{;;zUBz+(5DY$)oT5lFiZ-SxOK zuyr67KIcZFWL4$BqE!wDZ{+_Q=KZvL{xCPOTczC+nra=RRPPqg&FQ|i7~5=BTsPIx ziv^h#5D2U-Syr|pdZ4aJRjYBaVSY9xFJ*!W`Bpu`AjuqA;AHB-EMq$v25j$)i%9&4 zOe{>fj9y#|b2k{7BFUbA^llydd1=GBYzrr_Z5qzkWj}n)`fByM;&Lt`2eFf}AH2 
z+mExjFi!$t!aupEDc18TP^zCbtRH7TUa<Ua_B{v1d63r9I*)^G1&DOKE|7eEjSLQBFc5mqQ@z=8?8+Wdx1+! z3b?dA$sY<)^O@ zpK2)f?K%Hr&i;Opu-$vb_3#mMwW*3SJ8GqB1!yw<8QxuOW3c)8yzIirR=FD#YUE$d zDhq{=`9EN7)0?OV3XZPpqbNNqSQ^8mFs5m@2VJqEf4iGLW zTQv{b0&OpFFRGo-k*7Hq|JX_XS)0AGA8hx?Hpe14>=;oAQOg83*MywF8ZY=|Fo0|6 zSr%1mSpUOj*i9qqIF#4B1m7_-aLM*%|Jo%9`Pad|)81r{T#Njf2MdNKtLBpTafr;3 zJ7cgwR5`l$N-zIO1op2s+WYK`uInv*(nexQ#jCkhomcY<^c!zn{;EUQ-F#;oe(woCL@KS2Q>68=#48W{gAtzTHJ%S-yYes(KT}LZLb%XA=C>_L*jA%_onDPg^XtgM}bBd?7kW z4A!T;a`CQJycV_%62)QzU-JVwj+h^Ba}is<(HLd(#|K+Sg~^csYiXKtCHjg57PGT; zS8@7Ec6PAJMVa&|?jjl_kPyr```h?9;vh@rX~vhItyaDd$;LicEMKa9>bgQ~#a+l# zKiK*-T=3&GHa>A5g-ZUPFkMC@5~HOmm;1$DP$JV{up@MtEQCZh{*;G(J;Fpl|m%j1}UT&x{?f#WT9Ga3Xd4Pv=Q$5LGV3xs35h(#^@MK5zZqWjU~eK zXf+tT1M!*UR#U=o57ca$;A8XZ=9$;iI<^YOAk>JOT<^|~seIj-WVkRmKkB|XdfW*| zET`8&&K#)Psd`w?0|xo;wm}>WD0EI~)y59Ja=rF~W&bl+)3Um0n^1W0fE0isu8^s! zHcbvFAGqg^bZe@Cf*KNIj|;fDp&dJ@6bG%e|JAv z9G49vWc(}&t$Nd>9@Ut)WSNp)CwCK`~Zj{_`s88@Sg7s{{BR# z4#umQ6Dg17Puja=N6-B~SmQ=G)lP((DrcR0;_W92TzRIaxg`?W>n#kV$4gSjl0gE4 z$;q*0)+Gp;%3mlEj9^8*9^d8VKM7(aJl2kJYsel)=9{2H)ugw?OpixyDR}X zX{N=1Jr*7C>5F^>spam_%O5H8yYHWO5v=O`)UmnKq6u%_@%(h@HZ*!$Ip?gad*!N?0%kX~L zop>D1m0(Go5%F42C+t#fO4Y1M|31y;WMjoW^f+dn{{xiz#8lrNLF!XNKuLO5#W)2CebvDn;~K?J;m6|144;R7omw&=1qMX zTbmfx-&R0fUM?CNySp7HSoeL2@wv$-f3w7Zfl0rkiN8|5X6a$>olc4A3K}iqQ6hdo; z_;C&1*KVe)`gt3#i&FaHTWL7uZ7vrQp^2x`Hy>kN1;@f}wd(Nkn(0%8bzU1{RqR!K z#qi#`C^W^CP4AV2duN-WlLJTXbW!=^N~_TNifD z(n>xWf!K4ZUdx?RY8Cqwe(Kp@cwW`RrHAjVk0MJ4p>)aBLpF>&(6Wj}2zf>Zzs#hqr zTp>NpFLv=_{h6WnVp_`FsZL#bp2lT4fpw_ApX90t>m6U2v4+AJ&P|wH&2^Op!vt$g zm#x0+Gso?YRa$c|_BGTvPH9XDem)Lj3sS}7RCA9a>FNVF3@eMmNa{Fs`-o)aCo#kL zTM^_xNVm6UynQ!Tv6kxsbNbdpAI5WLb~U{RSgG4Kx2+ij1`E)UX-2Vt+1FX_PQO{U zCfjghxMg#^!%t-GD>5@upbUg36)Au+d0=bzlpw00SpN;_(eg^ZopoCu--n9uXV4+m z@xCExg=b}5aacnA5a&dGLQm5)S69NASFifgV>*Z`{G-qJnIV0Ff-giE)s|x-7tg!j%?**+@)G9mSHH<0nGZBVIkUdNA|SBBhtc8_rx% zDiT%@i85g|?I(7nsr-Mad&_`0yEJRJaCdhSia>C8Cs+vX?m-K8hhQOS5sKgtJh;0% z1Sb#*clY3uQ}3LfKK=Fd+tV}s{W-r%ct5^H@kuAx3g&WY$|6B)ias7_ 
z`E%|P&{Pb<*W3CE926PH>I9UusRXaa-ggWq|@Jwh-}1+I0}7zv@~1r^*-A(^v0SM@iCHUwzzk34s_U z)_-wJ$o~bj#GDKIB!+uy-cyupJOS~J zUo-hA-T-3xwZGLdLq@%Gu>FQZj`HZcp^Q@-5|)pLIw)&bj4@4_)$Xp%YQ24{sW#74r1R=3djG%A3n0;;u!I!ALjJ zm2PI6zXA9Lhn^Y-+F9hm#&w<~d4V4JN4aHbYiSU8T-2r&F)$4+r;BpQ85W zIKqe(#Ud2QMIN9voHC)AtV>j`xWbs7daU$7U&z};vUQH9b?fr|4-5)#+8D)cKK=@$ zZf$x20YJ=dEEacH8ggp+U`2UZiqQ08L8~2!22V|srtS@8X6{HaQZVAyea5fPaJ%Q; z;{9UuXIIzhn?ZO!fdLB;&YbE<2YU1`MyY|`zX&Y@_f}h>-?PkLL-R<-bvZ148BR*OhA>16stPsfsCIExS&7_W z!l6_H=Cg0pyUAt4?MuD0d&dg*I)JG+u?A`xU%uMXYrJ^c*^=t22_?C$;()bRWN6}f z+ZE+H!1gLw`{%D7js+0gvaIo8k!AMSTyhRnD?y|v+VfB3a=r1G`)MXW9eHlATpj6K z{9^TpcXS}{oLqHH@rfk6$e?J$!=hGyHE;Y=TgyKXIN-8N_VmRXw}b3$wfUcw{X6Ry z^IN%NHk}2=QB)+2O*Vpp?~P7UOEU3y+&i+g)25*Z0T9oMix&xBWwY6b(ou{#_fW54 zlF_!3;`(aTBVO6wNcDjdQh8en6jGE|u}beH@L5h!s~cCgNk`l#9bvaSzDJz&(LVoJ zkG->@e0}_oqR#e}_p>c9;Vp^R$TuHPT4WSkbk8Rf!__Deg!Lf8f*O1BcWwo>Bj-%= z=Sa4?Jds;g!|%y}H^V132=Qn0eKbUmvamoiL!>)~68#h=-+k=zSsHH!=|urN^7tSN zHu@Goa;HVn1g>B_ugI?B)enYw4`ybCa=CQ!euo~Z#q=R#enKaf3rT>bB|MwAfYG2+ zslk$+K4^>hK4WrW&{PAI*%FTC*sY1=PKh^6MwzzH2%nXnzu&^!YOwDcP!NN;4Heea zre_W2k_zSyMh*x!{F|=+-?mKuH_w?foFQux#BLl$AeTcH8;KUPz)Q&)=MHqU%ZS()5Y?>`PGf|EG-bZ#+_VhE4-v zvA4*PNrq{DdoPw1_qbIDrxbpSHcaMgjO5J^Ml#N@jyiZue4Xuq@^UmAe}$}V?~Nbu zj-P2g-8GJ5&{lkzXz6bRpT8A<{+G`UJNp!U93Rn2B(YACt>}Dov$at{N%*;Uy`}7; z$dX%ekMnA&mG zURT)(Gb5zAM52{Er2UqTj~h^O0mn?bM9mYp1gcIy!~_IfKCo&{0@lAvYY`Mj4Jl%X zqP|q0Dp2rPxaR#Bi|DsN^fJY+!g@(Ua1?)p%ex|ULaTa~jjjBc%>rVev%pn(mXUZH zJ93zq*i7YH3fI36CDd<^D<`C;dLB;0Au8YVYw&tdl;csp;e?o^_@bziY#(rk+Dj{( zJs0uR7B-t%fY3e-H-MNmQnAXIM$}0|2gzccb83-Ci2$*=S^!~mSZ!GOvuv}GpJ0h= zt8S)a*Aoan%G?xE+zqw&hKl4PDzrc@*pt{PsHC)`p!4|vyPczP_WpK-vA;xXa_8& zM2EDRe62!v;@@;BQh8^4*txskTK^9dhyUw!9iYR0j`uRDD-Eoaqf#mBU$$Q*zjqmU zB}9l6A$Egs8^By#VX{HjPZPq+kZzJK8RMfjn-jad{&e7uDoy#cf^+Vx`gh{&|89Pw zu&>p9gatV+>d=10LYGJRGwB)^DYnOYP7%8Nmw zBWfNQDt`l<^jCia3UyZlqEC-jK~)XJ>OLrBLv&v z?tZ+));yeQQi)MJfpOHqxI$JA9c{~ZKK1Z7pbk6S+6SvMvg|jYL9_2-j3+6bM=2*W 
z@B6!2Q)&+GWCRxh0Cl%vcC;AqR-*^|%&w&@$U*b{10`UTO)3Ii>MSgRbVZZ$YCy?< zYUA{y5Nt?4iW8D_yZ~!9Hzl69HyzE9uAxiD<71}co``S`y&ezL7L`ppqw7myP@VC> zusOnGU0CbjmgaF=$`51^BmJL zR&b#m{{@99v9e}$?+L&nc16!WOW=GU5;>p7aISo-HJKrt+#oE@qt^-V+wsS@@4n?& zpLB3=pfulFugOL7P%Ja3H?_33&hp$`V!XcinkI2Hbd=4BTFAB5^1;P~jY-dZit1bNjom7wIbT_OSIv_|IWzYZP*mI#pb z`_dD_W!qQm-9TgRJ+ewITmq0!g`gB+K6FsehHVr +6{xp3F}o8D8b!~6Gn)~_;= zKBvWr&G=X$x{Ti@S_%df30flV3f}+VF1Vd}^8>u8GDkO8r|Xa=el+?+v&j}8m+qyr zn3oR;%S13j77i)QAyvUPu|df}^>V)NUTY^UiYiE(Jy-QP&UG<6sG*pC?G4(E|4tSAkog>UH76 zSe5P;s;8|is~>aOPCOHJSmRSA79m~33>rX-)Z=ZxTgM=1(KgmEz|#1C>a7+Bct^N(M_-yMhNnHe?WO zT2nNM*Kt0A0^Mtg(SToXC+0lH*n6RFs|Pn{dioWzci9wCh{tL_(gEwT3WM++{&UYl zhiL`Rql!b1yolN=Og^0b#Q|T<)2FDS;NVD&;h6ePs65XD$Mf&&D;d0 zbf(#cVmQ#^ika=Et;V&1Hfacc11uNp)$Yz0L>jWs;+B*!xjEt|1WCE%w+RPuk9#+Z z##(D<#I3Q|Smo4(Amt+0_*N`nq6Hpl#_omZeX~d@z68nT?)3m^$upJ6M?vc%X{*An z8QXKx@{^rdV-Lh?rxmXg zM1)Kan5$rRD+Zq*tEU~0&EIx?^pikkUVNXgp{K^MPXGwL*K7B+mugi6v^&Mpu5{P| zEGG?x^%DX=p0oG*c4V?{|CqRDNcN&njnoAT+hQh$vB9T^@)wJCA23&yJ*h}dU3D@) z7`(0Syl~ros#vC9$grje8E_WzMc+E{M=IIZzF&IAIpIZnicD?MMJf6r8ysu6%6|OOqI8R^wa9|_a@QS1NA+m7bitSJKqRc>u zab#}(>Rl>zPKm3Sf8cEBbIx-<2nO@m?QlICn1!Aqxyo`&P!MX<#8k4^og&$L-COl} z-as1ECkQJ^z7T$eJTxN+ppXrrAci8NS^4-s^Ukp_G{qim)YN7{=|9=Vv?^CN*C^s! 
zPzPgTS_5G=oBvT4*VT8mZx!j*ck}2zK8lq%;3Sq(xX5NfnN%Q#QwnW>f(j=$2fOA^ zcmMOj_StpBmBo6BG!gEkYhFHKUIgzLlFi?+jGL)l;O6t`*7%o+IZ*mM^@%aw7HpKu zPGhN`_7OmVV0}(BL?=>rrFE)gbIexVaKxlwIr?Cf_JeFn5h^opkVDpkZmRzb?F{H9 zs?UOD@k_B)mqoK8=4-(L?hW#ye%63CQ5WJw9&)XY++qc~JCfS5Kt7gbT@_v?c{0bn%!(2GXAoMprtn2shI%T$dUw zTRDFqHvohmr($_U_%|bfH=_O?glS>3|O|eq8#2`p%S#CBg|?LUEaj zO{FcJ`C`bgM;H-sdW~@=MQEHv$g$I@(v;V?Uo0#Pk&?X!uDOL|U}kF=eExquO8;4R zAvyb|^el6_x88~{`*^MUjV^{YL3ULj5~LNs0iO%c;LT<4G>cK3%%V>jbFkU<=lnsr zR3@52Z0RNY&M!YvP@tq%T3Nhg(>-F2E%+I(5d)VL`Xe{rdsY62lSAFU_Nmg3N>=C& zE5BNBuQ;S)IK!3uf)gFvGiH-$T1$ADX;963yQLv^_aji=nuet+(x`HKh+{~8lSvY0 zkkdiV#<>#3QT+7o%hH^>-d4!{ZkKkAb&|q2cexj5@?FC^}!q;2ZRoLpQs7T0o5^5_|7b0t5e&xy~ zpsGSQ7tG68Ndw#6*ifO5Ur+?$==}yblziMyvt{73NtYCs`;z*;OT^Tg334aR8dgY zv%`!D&kk(4>%+X#3Nx})V(92G}`uJdZt( zwLSn>p{52_amV>ibQObl%A_DcHKXu9#WVfqZour&*Kj`VG~(1ufmC;8U%IHK9@_dK5WbQF`L++OPR%d(s8FP6T`qzSqo$c+740Ii>_8x4Hpm`LQ?oO zvi0b4b}4iI)SjVVyWDr}IEL9!gJepIHR1)2Dy;cyVuV9Z5O8cGPgEXP$HJ1X$8=$FyAVVAiT=iOIXxh^(xooK`ZGB^ws#E#N_v?9= z9%}jRFr>Az#Eh-Aj9?BfgFR91USGDD^UNB3LGKpioJtj(=$mF?l!HFA$ro1s z0cSZ#7T`QuXL&)}ueY@_EmN%SM(;76USbUgdKCINZecN+AHIxF4L#99WTd)rX@q45 z1^-}=d{CP|I-$@w+rIwj0eT%Ff1;;H6(Tawi!@Jq*AOn)1L@{9>AN4#D}dq1Y|bbQ zd_ZYeaN6z>3U5VfLgiA>U$!C}g1ZRn=>nEzG}>U3QECI`P*$mH&YHSeRN!Gc#;~N! 
zrDJl<+49AC`+pEJ5#4$k`7XM4B)~6ZIfsV2@Z!kinF6{IJu8%1l}+INIj0?YS=sKl z>yE`%n1>uf<JEc)TPqk`YHZS}c*W;j_hbrn7Br9o?BSdSGt7YPI&U|ii2q?~s#-aAcPq$i- z;e}SLds_ab!B1|3QCq^3eL`2GYP?e6a54mlpd$&LOV74jqHErtHI`M8#lx5&{V`=y_>3 z%YJptVg{f(*9A{X`f+6SWw0+j(QY8cIi$U8)wYo!lb7e=aJG?n-{IN8{#EvMznEDP zVuch|EFwvfizq6j6T(1ExFIFL(Vv=U)xrt=gk{`#3z26 z2=KnID4;GfXQSR`Je+69W&cFug|$G7VsPc7i=1$}$N z0&GeGdq(ZZ|Nnd887{wJOn9zTU?$TG2Yvfbd%;%(Lgjl8!-z5K#lz3{TCo6c5$vgP(e z3Dl!^s(y9ryW8R@n+eg3OK=rC#^QV3QsEa2Q6gleY{AXbW%RsIB1 zwOInSegiUU0w~-U{D?9 z5@SY!8>Y+pcNTKBX1Jd8ltA$&5Vi2FMd+~QbXjFcD&k0`>qPi_+ZQrgx*1lPUfemP zw9LW2NJP8N-lN7aAkA1o;ta@owyd0u%_?nSM|Gy#g$?gbl!S(z9B}tL72o63ZBZYz zZx!vR-rh|#fBs&rrMaahe5q^TXOkwBw9v44rPnSeDAT}M zx-6Zf>cTIK7Kjw{rfTMreS8qEz|T-H$kf8VQJl$n~gLW zdn0=Dkz|7cOZoL!W0YxUBSee1gm>1}5Ulm3Hhf_0feL2Z2ROBedk|BXZx|>VeA;v= zA0~;RYX{FLPfzdnIrJqt#G(q5(Myxr(&Q!wyEEkiV9Uk%c_nJy1UW%jS4&!dX?aUL z?)Mue3$S_!2xpXdRzmvq5IW8|r8}g9xo(<39mH{9Og9BXZPxN-cB-X}C+1(8Uvjw; zsZpLfRHnwP4dlSW*=;9_{k3++FY6QEx7N0TuNhh0g`39OYN2&-g^78Al5$-9sUz97 zJ~onxpYH!0p822E+221}FWEHc`1Ti;2#w!pr*h!b4be$yeQtzMn1L#KT@2T*_Vxdy zHo+T2&^f19E+2Gk-F|<9=k%a zO28GW){s1MPgT}SS54Qh@Sn@$|9>ChQcCQvg|vElou?^jAzm^|4b}J(5RFGvT??`A zs~%J->djAnvU9vjvg7um>H`>xPAdW6gYI3OFO+35+Lq%>@ZV$+{Fkbv+Dtx*u`b=( zZ@fvP*qb)l=gj9Ay2fi=V*UP2pQjfjrHhS=FTcenDFJuzSqEo5{!<|Kxn|X({y~sc zM|yZ&yUFX};5_0thu$4+SbJm3d2>jUzb0XBoU3L z;$$P*0Cg8uBs`C`ozlq`moWdZ#E#x(>&?2imimCG5nzmau8Mdi}g1; zrlVk8c+Y&NmcekgM-4w~7x8GGRw^O6&*?U-6_^!?TfsFnExM*uBhFp{Kj=`@FBM<{ z{wIS^%VN_zesZfI;ACkXYl3T7Ov%WH#63|)`0c-$07R#i<-ZOE*(&lc*5AQPL4aWR zZ^gs!fx(3E3bX));$O>esg#%VV=bfQmj*@Ka?ofK0B;2lW7c3okk*?-a zT#!@!sA9K1HZZ4r7{{YIbr?e26c`?GNsvQ{JlC!*dlWjOlpW()mezn=v6gKp>P^0# z#_t2I(hW1?9_eLz5d*Z11Zd-Xf?57rJN&nk!|afnZOYvs zKw+mn&M%g4X=!OezY4Oh3UIYRq4ggf?gE6$Q%)jpYy4nHTj@?)F_16vlFtJlN5$$b zF%D5yk2*I~#E~#(#5vCNQNNz^0KEe7_hy?<7JSTa4={@%Vd{}eC+1z!3Z{5A)e1pi z@~Note$mnhS=nIR3DtZJbY-;iphLv8Q*L@J>x;m=wJQ6n9ID+*wZe+(uez+Ey#iy__isx;Qj z)G2Jv^ZfFGHE0|rN7jjIsP2#~-kHTOi(r>e><0>;^M82X0eX3GGN=^28T-;ke}kF- 
zGUJmTxhke*aRNA_0omDA9d0n7=sk9tGO=DLVU zTO`$$xgfA;toQBl^?7Ba)l8Y$%=3&pz5ORqX}Q>pkEvf(!6TkulHpIa=>XNc5GJ^* z0HTYchz*6+R%*%Xn{SZYv%&@5}tvV%PK`j1PS00w|_(d@?U*cv;j?L3tJ zYWX^e>X+cqy)WGwEMs3_h&k07W9(Rom5TDL8!MXQC zDC#3qziS`@dB$8?MbS4o@{R)EM%6ZRjvAS3Z#6?ih$`IApqSlCvVM=xf8U0Ii{B7c zzEr5LYpRVv6B(15hB^sGEl}119Yz3m>AR$eA>%#+(>7_DJGlWBi!0KICZ@AuV>sac zMp?6HnfE}PFL#l&nAjPcF0*NRa3`6k>!Rdd78;0Hz6O>ofmf77yNT`%bQY#0Zf5tVQSPh@oGJy4e`Z(~D%{8JYk{H-mCzOPAHRebv0jNE z=~Yy0w)&W1y(Cdu!(nNF-_YqYY8pD?GUXi*GRMux$j0e%dNFobVi zP;JLN%3@ok_|b~#rAXWHYGcLHc~)vE(T=ZiE@&f&{HL8UXHy$sfMsB%zFpugqY4*fPwBrisxuT9^$dUI5sbf2uZH)Au)7*L2Rn5+Xj*i<;OI( zCDLTO5ycF{BpKKc7#FAr4HkL4Wq|FhXX@>7NNy`U>zi6x?b5i)8-8-E<$?2Ss$4#U ze*P4DKU~o>Aw7NMgmv5=hT%WO!tOoX;sZOl+eg{xR_mc}7ryR)=I@00IXkgMxHn?i zb_RA;;wG1tSYh%4WA)pMiqlM(Eev~e2jw(rKJ8MR$RDnW`#`8DvOZo&g|WOMJpeTY zt=9T8y{ACWSy<(0*j)%kWE+ev{!+zkLpKfSG}~TPhDdJh)Impeehb=-V=yG4IDKK-Ak74W!`` zQ3}*|@gu}aLkL*CEZi~CuWmhxOpNSeOYNXLB^D))mpq59*L+huS_wXOP0V%02q+I=ziAI)V^$~(UDTO+ty7q@3aAIqXi`%5|onF+?E{rMV-9uJ8^WEYh*iN z|Mj2|YMtuo2*f84acFv>@m}Y|s-c>`>IBUp#t#X0{7U481Xn zVy%PP+nVa{t@*C~QgLXDbp%xihxg386BzzJ^Lb6v>*vI&R7WIH=25oSjyP2JWPYt+Yd)Y;0R{ zqJdnO!k>eP_p!Z(%R^}IAyNuJu)=`C3poS^faN5LI9uhWlKFu*PdmIQyGr^yAJ%$` zhj9Gz-o8SP$>9FCf>u_ha)`UJWin?M6vv8kKOWV-Eg$+8Lb96^LYx2ziaRV{folR3 znitYYWJEBp2;r|~2BoQT=&It(F^j1++Z42?`n3Zbm=G=lz#;c(hXNfHmW`SOH3KZXNX%Z)Yn@}C;@P!? 
z<`}r}FvW4FrA%TM8g0dVv6YI-0ynhA%N|CamFeQ;u_3rQ%_GcU8DMSc%2ExVDBS?1 z$q=U2t={!i`Hp@1?J)biPAGOTjQ#7h`zI>nyS7fYfhVHel$%eP-Kla2EL?K5sz|W} zJuM`Ww4tmF??P9HKIX)|BT2Nd<|4eLVA1hm4RuNT`ENM5ncA zlbvP*=K|qxlN1`GsRS+N@x77@aPRoW&?I_@FM+^>#Tza*l+l@v-s=39G|~N-AiODICz$=EHv)|v-Q3%YP@wRRRqD>y_WAfloyXLDw7|!CODbJ>9*IX+s@Mf zNB`=Y{Lijif2I&kheex=t_2uAQPZ~(_{^a34_`Q1^;G;2F;m_UZ?kWz5bq*T;eVv3 z4y=4CsO=-M$rdTUwxV#sIVH4k`nv72_P^n?|1+j#MbNnyKA(5!X)h^cLk*o(&I@|ZT| zm=l#@mDGukvlVsWVf3;QCH~8O+CM=N{EhbX&zi-QBK$2UkrqKNY~cmEchB^3L>JCk z9%VCfq)nz_C#j8YO_e2-AW$swgJ$cj6VmqtL;Xq|pkmZo1|<(Czn+#vuOLPwCW;27 zU?9o|$z~OT!fc|YVi7s#8~ecGE+b|-0&4MK4)y%Uu9$lr$--NE$*BtRO~ommMY08S z9!Nt80nv~Ey@bFJ3Uv?6;PP)w=)eCgbNp=q4-6iN&1(y)4?6moGQ1ejp>@&epSnYD zz2)DAFrUS36z&R(sPT-BS#+f(hesqMmwK=#*NpqTvwcM`<=QTdtqbviWXiVrl;L<8kdo!6&F z6H9lirVG146-upMkDY#wZi~E=cD^U^UwurV(DK;G%xv~!FErvu3xOqBm&BP_o!Hym z&Fz{!uz|Pns(R?N7xJQNQ=`nsuR1UCB^E!0s-+;wa`_VV1N;hz`uoqZDX0|XrbW~W zjP}wBi?f?!5?$aDMz_nPG}oFbbkE=;$av>;R~HUET~>6By1ndQc`6}>zHz|ECUtg} zqzQ*XzWy6Z(LeqBpW6dbHfUn92qvxY`g!gLUS1Xk0O@-S3fF>%9ztVk;pUshxtal! 
zXDHX=k6JVfG>*RkiJ_;*+4itp0Syh?ZveZ~W9@#-x%T|6Yd2>uB z=39i^cmM*Rd*a;&X|ilfJC&zQ}*+aI&7 z=j*bAwi*-jXgPG$^J7A}KkV6mKkA^f&BCiXN6y`_AEHpt)}q-XR>DwrLFAYgc9B*lzHN` z;?d+Br;i-p?$xOV$3gKXwXB=BOuQ0Q{b6QD!|+-NV>M=11rV3Z|99_(*jt0?> zyR;JDMi_;02lL}4=H@(No_n_D_FHTinLNCj>(Sxk3&-D!=eA7=AmE=NC|9lOWCeOI^~rH8an9DRg)ntooYP89=&O1k-IJKIGmG_TTShx?Yn& zW8+F~K-hWT7#6{(aa&+XI*UTgw>3+TmX6SMD9qh$WaBrwP>!>dA)*FKb1RnUjbh8xrkl9Ji5{Njy z6-aHJLt94}5h8!^##7rcm)UudD<_xQR_z`U>sx1E54$maI~}Q7#~_B{&`sC#Oi=BS zXEjoXHL4rfD> z{_*8e0`t@MQX5siDsni{;LqEao4seYo?@i))-yb%2F(o3b!jg=uEV^rwwVRLzXe!P z1hpqYh6P={Z%odLB2TVxsJ!3HgQ4rD2AMBq(gb@+o^}RIEhOua4YyJ#v4aunnqj*N zQJFF}r?rTE5cSp6Dg~_+g8bmu3Kfl*wawiAPZ);*D#{I=wMpOWTRy%DzfNo>g)c%$ zOpwzbjM!EDA!16swPtFL^C{Vj8F=3Hl-oh&?aB|os@IQc-p+z@ zIze31!Zy2L)-tKDr50eRCPS{R2z9L*G(?1)f4WfoWs&&?i<{IwPd$%67}7Xiz-UUf zJ|DhcTs8)-w7L!WzAw|sybaly45YQ?=hvBoq#2DJ9&nHP@o>U1Ewne_rK$E=jEqE0 zuxG-G8GtB$^m!0o;3U+jFcX-D9UtIDrwGI~GAc@T*p_2KrBA2G6GqjDAY*bdwk>~D zJm3BFt6d7Geu%Y#^>N?9y;ajPZo##tE)EUCnQRy#@M2tE9RViqZJ5w_oAi+28XUIN zdDf1HZ^p=8@J+ga@e~jQ0U6dPR9HU9r4Lee`dL7V{!kDHB$t#da{G{P3nr7nva7Y_#2Au!Ao}d#0jro)=@U zVc5-rG8_+&ln%iz$LhnEd$<@*+6$d25#BRvow+NJxXC$EL%Nl!&h6Jd3+j?b)by)7d(Lty~)04b<6Bi?v+8ZW|K6c8^lU*?g`cVe66gx7WoR7(gI_>qvYX%F^B z%mVC`(R}pOx4y2+Vw`cRV>V!wSXf+EL5!zb6TWV9lK+z-@721?m0Pj?;~c*c%Glj_5{+^XFgX01yU=fSWdwS zOj4y7i$?!5xDpwS8TV4EmoOLZgU#`VHJg^GGvEXfxB>VU+x zr^;k9z=Z`aKxF3wV*IMMJh~EwQCd@r5}-4M)SsRBCxNeK?6jd08%R7QHbb0cJ$FJI zHOz4Bje}|8&UP2Lr#@cV>}7f;>x=_YI4>QfX8iHugFIyBtpc0agcDYsnj%Y$LN-0c zxTEYgx{*})GFVWHmPY0fh9X%%7CgQ3yJacYq4S@J?SuC$J{Svj&I4|S7|U(MLz(5P zy&BI}vR@TGkYbb!N6_}mP7}tSzyWbR16I(+?T18QLm2P&VB|)vdU|Wtm^qSQiK3EC zZ#xkUu`MhQ(F(K6Y$APli|D!&=MghnRR2Nq+_>_o@>Kx=#i*{@@l>HLv8?daz&2(P zpqL694nA0<1_tWQ5;}iXVoh;)V|P4!(B)U0EF1UD_HN#|rS^g?$#LwB+R z+Z*Pu!5AdU@YSnt!UWqjl!I8i!Mv9wbZ&&PiBeY1{)2)jOkzpX{>-^H5Z{3Eh*U(E z{T1teST<{`>O!aIM2;d*`)rzl8H{U|;2`p9nSZ#&bql0DqkhU?as60MnONxieMh*HEte7W z0JFwekxg%ETAh#^UyEZ~lj(+m?vH~pcy0>{ZPN@jZU@Fsdv(BuN{*}*FTtieqRp1> 
z@3_q7ce)UF$q*QtBD@_{+wcdOP%K$uV0}BL-njJ;L{Hl2SD_T2(L~fnLVm#Z@o>o> zUhg)UqKZZ#H$CaB|H%7k&%E9)b=nJsR63`Sc+J%a*TqnM3B|CIWLtHvRGYPuN3oT0 z+4tBqqR?8x2{Ul~P!X<+e;Qm?dzrh2-G7xHi^ZjbN4NWeNGD!nZqV&vT zimhn_rkX;MX22*VL@H9VuzcjBXrbMTO49YAe1(7cVuO>!A}sY`P1cozh0dIEF5V}O z;Vt{Rv!2%mY1WW}QCC6Gy!EpKWrZw<0mjfU87xKcFBFK32zx@8K?IMEXI_yTX~G7p z$9`MurZsm7;MNeXr?KWPdPTgfNEB+c;uY`84U~`W7$DgVC2b|CRja6`s5eohb*>6E z2we(z;~QLW-s*Wi)l0 zkpnrf@w~D>i}7+G*$p|wG+o^cUt}7nZbq*2S=RGg1F|Obdl>-dmc<^-gQ*hB8_Ra@ zXQcg1aoaIQb@H|FnlzW0MdPg%ODh>AcfNJTCfM0z?}>SRHQXH}d;L7|cI?M5b!pEV z68wqXyaYpm#4mgs$ZqrJywj41d=2LYi~W=wZ$jwec+rb7L zPyPCZmU@a3E!tDT5{PZ_)H{0|O+9Ogqm+I!72$b554aEbe6?cRe?7DvegRDZlU$Y- zzhGu0piOA#|6>slIj}Z&Kk&s&Rz&GPPHAW^WY8S4L{}`(Id!$m+x=Bbvo+eDg*D{0FP{(z{i2%0t**DM%udMK)`m z#P>(hDQg}?;AVsmvt=+?3ewBD_WDpii|RZgWMkA#>|=?Xd&n|}TAsP;3kT+4d1`aZ zkoYg!+b}{d&Lpq>GT+6eQf$!-8~MRhB9xP|Y<=M@ADR9$=a8V_6%SmqjL+RIK24HCF?Wc~$A`{!H{p(LI zZpFiRe87ZM`GU2f1%J3Ye9ai^XutvEx9dL#n`y%=)Fu1>4L9;n?^%^D7EWHG`7?tZ ziVE*0srnpTW64B3?u;VEI+`j8ZXecJ{WJoEZe$|$W?8pANwl5iFJ&mBb7T)u$EP)$ zfSArg#KP?`e(aej^O|fy&^HB_awC1)w15Mi8|`~Fo9(3snxjSKy)d<*{1S@VI^Wo= zlAK4evz3rS+c}s0N#o%n>!bOUDmmbOD!F7NlrzFrmNUNG;~$R;xPK%PYo79C@~bx0 zH?_|kvsu>#z(C^Ytx^jpjPfHH#e&oyreVhjQ^I2WA~Z$beB&Dv@3QWWtv(M^^7FI4 zG$;Wc#j2n;!;lo-yE_aqI_LZm?7W#8${qD>A!1`J)Rr23DkpP0WZ9!uboTt<<|Rt1 zXyotJqJO7s=&)}DVTnA$rOQpe@2F|2ts6+cW*!Ld9rzN9v70P_?nQk!M2j~I>p3!c z>#XBMx5?y;&`*PS=IfhWXs*lfyDsk%g9lw7Z)3i4!M7@SZLO7D2Vp!?P5O)axdr*S zd6gTjEPkn!{fe(!WWG-))|0>M3(qwcukKU%2Ie*VXa4ekQL)<8Pa{VV4{R{bpan2BZ;ajY_k{jsZPGUzknRP`uL?{ zP4i4ELXt^VvRk`bz)HX@R|a5lYw@mann%Mvh=;(4clk%}FnqG?r8QVG9GKV}$K^7^ zQ1Y+J=5c>kKIxs^Cdp*K5t_Dxd{D$5z8?Gx)YIY2POai>GNq1;v3{99S2uASzybZ( zmP)K!@iL)etw!eMPBs zk(lEes_ys%YqPo6Zvg7#OdE!qS@A9Gj5}b~qD*5PeMp`&6=XSQB;-{;ijTu*>tVXU zB}TyBi*vRQZr`Zbu$+`qRI!IZ`1|ClqnW*?fNzGs8gP4@0c^ z7p_J2j44i#?>I=(_^76`nw!~^Z5U4tk0IBch3@dxTIwj6KdmM3@i{Dlw?D#3ZB^4^f|;o|2k`Xt zTaEY0lk3;KAhzXoFJgi(7@abRhFAbr%jTDFXX`=a_k*gc*E-r$ z92Z+Mv2)t7UoL7(Ih6W{(9uNZs0-EtG>i-e=dM~Q7P5zl6EoQzi1%vxS)!)y@<(b& 
zEyi~?Q4rx}01k%#?)enVDUD}^)fQ~oq7iwy3&U1K{Vg)cG5aVf+;4XAK7~YJ;R%jo zpQMuUo^<>_?VWiv6zm?yr-UTSOBhV1?1>R$-}9oG#*)f9lwFJ=vQ09!#MsIZjU{AX z!wkkdHDpV&PK=CHQoQxznvo@g?%Z?ky??#;+}k;w``7#LbI$Yp@jK6X&hvbp@9+DS zIvauFe!x7J`o1N_&RsufzBAQ)JoO2&^QaG0n3K5qVgcv|h>)^v32Cr2?9S<*I;&vce!k z^-cT?06hJFdXfLv|CHFng2jAAGDALU4PIMe6`*FnWpqs^tULDI1}U3})9%@YapM3A5;)LeO6HiS2#;WG*%3We{(wAN(4~v{~?lryt^gPwe z?f7#*^RrPi&NIWQxs5GYjP~Eets14`)eYviUMP<9ECnxz;$!~#qx4_Y$_;Vbh!g2s zy1zwidZ-NliZia`Z?(rVCS1=#X| zGNtrBU2!_?cY~ofLpg9&S>>71EIFAay)bzo;qWh*+bj*Tm~;t;(~a&=a5$akKR6xf zThdDwlY(6>3?04c)%lGiBy3XdTc`Ncdvja%p~QOlz_rNpo_ZFdry)l7u>QC5gOdPG zcf0X3cGY9m(baAp`=g{RI(@3rill;T#tm$eSSGB$2?+a8V z5*Ot_3DSn(fF%?F_Vr|L{Lm0-iD^la;8?{ya;KJXJy?y<;@gY{fzEg#F-M~P98^*w z7p1}ISpIXY{kKm1y+pdmiRGQkQlaa4^A^bF@iSPBX&U zv61|-2Uk}X$t`T2Z}Mqwg!%Q1BGn9<&U;EO^#I_2nFMc!ESu+X4K=&=T=i#M_*nQ` zxJxFb)!-41e@XfEn>xKOdW|+vM;pWlZl#*|B!vfsD5m;&65FVbzP>>qO}r-1Kdc#4 z@)L6a-V5Ng*}x5Q_#>JYi?bV@l<~`&hY1w@Oc&{!UbAGs<`lcXvS0q@d zsl4_@_2>0KUx*)aiNuW^F;or^%T0KCDT8I!WV`htxZL~w+f zc{WZi$HeEnltuL5Vdv&IFC^WPH#l00oQYj0?eBo}Rjq)LM`T2`cx|?EU9YB9m0M1> zCOO-7l7o`s4C}$quqITdF3yvfznsWeutqUOdRv_ZJsFt}(r3`xx@ip4}w zjvLxPXyw)f@K@vEjMNlQkCwdG1dTF7vr+xVBLEiJDE0%Y5ZGq1KY)O*9zq?u*Q%w= zh?|X)boI!%aGZRS3;n5__w*!ky9w3B<@$%?A`O{dnA$M)N=-O>)sni)HRx!VR)6oh z`CzYz3@7J;$>wyXQ%}4yGExurp=C}o9I9#oNJSV^#n#$-EUdKM+ZwGfIM=T8Y<3(B z{8F-(6Da)rqg(7hZ2+(`(j_&pS3GCNhui+E@Wm z^Jh)kT<`mvM;Ap6VNy-F>Htex?b+WnkRWg_0gezPw*G6 zl619cc*Dmf&~7@fI_+~8*-_%$LPW$6Ke`Vl3PGSz7-6`gFJ9}z%$XG#!N}N6p)2?D zbeLbDR~w>#D^Q-#-U1VZJMFWIB*E*OVqWnYUu~RLu1F-=7um99(W??80qr7H7OY83 z8awLbW$F95$&&q;8Rq`uh{}MS+JcF4A`_e)w)y&(rv)=wMCjqg`_6Uw9%8c(rZaod zH!T?4MLVUFSI{uq<5#@`o%qC^fO|Bub?D%1(&&vGj#hqQM~1rPN`K6o74hP0J-iZ8 z5A>y?M6%Y*Sx92N9QKnh&U}~hk|O2@R!m9Vg-5$AQ>Sia&-OKfG&M+t7x!qe($+c{ z@%*xPmR`@T@lo5gDyfYlW0POiz(`hC8>wFUJ3z_gDr1G_P8bL!gZ#OwOG^cXZ0S$V zalcL0>c9sw8=tamRC+%Z{t-cLKWoqr9y(G_teRVdbq`3nLE`T0`q1|UDTrdM>55q% zRE*r|0+DOial9L Date: Sat, 1 Jun 2024 13:08:52 +0200 Subject: [PATCH 14/33] [susy2/ps2pdf] in 
CODEGEN/generateAndCompare.sh, remove card.jpg, diagrams.html and matrix*.jpg files (created if ghostscript is installed) --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 71f7719047..47a36dd1d5 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -318,6 +318,10 @@ function codeGenAndDiff() # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") ${outproc}/bin/madevent treatcards param >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") + # Remove card.jpg, diagrams.html and matrix*.jpg files (NB: these are only created if ghostscript is installed) + \rm -f ${outproc}/SubProcesses/P*/card.jpg + \rm -f ${outproc}/SubProcesses/P*/diagrams.html + \rm -f ${outproc}/SubProcesses/P*/matrix*jpg # Cleanup \rm -f ${outproc}/crossx.html \rm -f ${outproc}/index.html From 1104579c0e0e5bfe87a36107ed9ef99bd95a3819 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 1 Jun 2024 13:09:59 +0200 Subject: [PATCH 15/33] [susy2/ps2pdf] regenerate susy_gg_t1t1.mad with new script that removes three jpg/html files generated if ghostscript is installed --- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 14 +++++++------- .../SubProcesses/P1_gg_t1t1x/card.jpg | Bin 6238 -> 0 bytes .../SubProcesses/P1_gg_t1t1x/diagrams.html | 10 ---------- .../SubProcesses/P1_gg_t1t1x/matrix11.jpg | Bin 43691 -> 0 bytes 4 files changed, 7 insertions(+), 17 deletions(-) delete mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg delete mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html delete mode 100644 
epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index ea2011c29e..381ba3a551 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.131 s +1 processes with 6 diagrams generated in 0.128 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -598,13 +598,13 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.192 s +ALOHA: aloha creates 3 routines in 0.194 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.191 s +ALOHA: aloha creates 6 routines in 0.189 s VVV1 VSS1 VSS1 @@ -646,9 +646,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.107s -user 0m2.794s -sys 0m0.312s +real 0m3.122s +user 0m2.790s +sys 0m0.322s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/card.jpg deleted file mode 100644 index 1d72b64b3c4f4560608bc8fd8f140733a9e2e29d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6238 zcmbVQ2UL^Wvfh6hNl1V|=q-={p$egerZjWU{~6d+nK(&56xvz-4a2FaZz* z0W;PIY|dgy3?rj`*0xqAj2+uq698bj083!3XHalB)7I3GwA;y<^uzX3+&#lW4Xmwq z{CxlSadix^A_4GF@!#kEPYc$|JIs^i^_8VZJVTk`06KG^(`V~FqWFWax6zic7J0MI`H0MGbk^EeMcO9B7_lfP_~O8{`j1JFF|8NodG zXCzj-06+pP-~>b<2*dyxP=O+#0ZpI-^ucyu4y=G3Z~}XQC-4J7fC(bO5fBR!K?=wK z*&q)Tf)Y>;YCt_`0qvjz+yR5&J{Sd$!8Di$ufZ}{1z%VPk`2N`JdhwH4oN}s5DlV3 zTcPdH4#*aAg4`fqC>V->jzA}%6etVIgNmSXs19m_I-q{&0rWdG11&=Dq0cZl3NRgSoRt0N@b-)H;qp)e%YuG0^fV0E-;1X~} zI2~>Tw}Lyvec(*^F?cdO2VM-XfnSIB!XLt?;cws@2o!<`K|&}ZbP#3;dxR(A0OA-T 
z6_JN1Lo_405f2eFh-JigBpxY(lt*eK8OYs8Ut}aQ5t)N5MK&ROkfX?XPvO@&csP_e7#v<4$2syjnmI-|-f?0$r8o^Z_i`TM z%;9X{9Oit(h2fIsGU9UQI>wdH)y6f>^@+ekP$5_mf(WUE3PL~OB{v(lH1~FHFYb8m z%iLYub3AY!GLI3DC(j9<5}qEOc_NA^Lu3&56H|!Q#3AA`FE_6muLExsZ$9r0-f2EK zpEMtXFOcsvUp?O#-)DYNegl4Q{v`ey{t^B)0bv1s0dIk1fm(r4fzN^@!R>+pf|-IX zf=`7ILUKYjLXkoRLOnungo(mB!k)s(!u7(FA}|p-5nGYNB9}$(ihL3k6EzhL6+JK7 zCHh8;PfTBIzt~x^n_`RNJmOo${l(9U-x7aC;w2f70!g`~Zql-Zh=iF$ghY|Vki-{B z8A*G|ILTVcr(`UdPWB<6C3ljSDdH4M$`MK>Wl{ zjDQS7=8#OK%o8e}x{Z2(T1pm9#pJUoKfOa+MyJu)S~om3uTM*mdq`^TfQr+D+el$q0C)`qr*cDQ!E_L7dGPJm8@&P!bxU2okI-8nr;J$Jn# zz3Hu_t!`T{ZJpjGvCVy3@wQogvc8voss4h2tigVRDuZ{1%7#qCCc`x&ZKI<`w~gV( zrpC#}L)*Ex+i%a?K4l_l;%icAvTUkudf4=~8Pd$cEYob9AjT!m*`RE!ZSrki*s9nb zvF+W(vuoe3ie0OAhIXgz9@|sxBkVgJI2~LZ${p4mjUBTbr+07JeRTJrlaQ0YQ>!zZ zvx9S~^QS%A_hj#xcTsmqbQyD%agB2A-z&5?aPN(MocrAOHMk+&9NjA1zPekv7rTG* zV0c{cSoSpXJm>k^OW*6P*P^$ccb4~pkDgDK&r4rD-!s08e%t(V{NDH*`RDns>^IwA zxPL9cD&T6sW}stWZ4f%hEvPM+J2)V?J47rbD&#?^d}w0m^a1Sy*$0-H7R=HxD9kmi zHJr#QigzQZ5eX492e%%~Ke!(05ZMq#hzgFn8!Z=|6uofB3g(d*(!fqp?vp=e=B;pa>KmnMo#i<*n2i*qicE+4%7qQs%3_X_<=d8tro zO6j+&!B?ltY|FaKHOtE@L@G{K!YU&w7pq*V?pGUEx7H}u6xH(8rd$KpBCfrvbE_Mx zx2W%Ipf}VsN;O_+;%Q25hMJ?Bms|W=rdyp_AGDdbbzaxG-t?Q&Z&%wT+VgMl-Z*^| zdo%v#=B-1wKHUzz{iegWW3F>w=i{#3U5~nLx`%o!diw5|-0A8y=)Kja*Vo>!-QPAq zA7~lW7;L_)ez)nK`n{$hjiKgY&EeMjTKBI%(0y=oWZOu`L*s{c9x)ybj_w?NFlIkC zG43)xJ>fO+^7p{sS0)cmet8`81oYr_vGbGlr>WJT)h}zw>w@cL8}yCd&yJrLzC?ay`%rY0_{2m+E@Ld5{Z zW8=;yd4NcZ%`8Hb4eL61Z67RCR6Uq+7%U&ZfS{0+v=bXzupI3ajIQJfyE1kK)?}fKX^f~$RETB2&9Axirc`3&HVt6BrO(AG|Vij>%fpzZI^jH zm=Cag6tx+tl^@jpVD|Tj#r?mS{gc=)UXuU^hgh2jCxC5W?K8DF2K%3?{1(g6$xUGQ zsA}(!ndOA<4Up9e!~7O;!8`;hoiC< zYY-~+B>5Ye7^$v0idT+8Pi_MjM^-ZR5{#uB+j?&DwYS}0s{TsmP0udhPdoWrd(yp< z>a@GUchfM6?KMm-+c{FR$==u%X<|g?r#9+J%^wHya%jbvLuV=D^xRX<9$>kWY5S#OplQn^#L+dc%A} z^JlZQmA|{4$S`=$Wza!{nE#;igU&CN(^^Ozo7$6-n=;nhFX100*`lZS`j7aaU>#Xo=O;(V7_QRJDXs2NAB4&?AJn=kt+&$YC{ zZOpNptUq-6OWR&)T)L>4mqaY?(sRN{`z`wiO|>UmT7<^Vmz(5;j|*S<5UWu~-rAj1 
z>_2K1PX@qAt(6Zk_ASVQSPO+xMDb6W8kCI;$-ksuU0aSa-uWzsheJQBKJ5vi#uIZIb~ zJD1&c=i&K_v-U|Tu)YofowQqF#$a{|jqN*PS$4%+?oVetklYy^%AtHV@o3P{-yxafUgtQSu zuZ!dqd{yGgIo!Pu8f?NB`>bI#vqY%Jqn1G9%~bQdglZ@H*fmRo&kk8Z@yVN zmzqsct|kT9LQ&X9v3ixGIrcg2LZm}|bjD46Df0{iiR`n}l`yoNJ}C~3usf8K?OFM_ zihuRN(5YFo!O#bE1e=M-<0{(7C4g#g#&4y*VAWi{aQRtE;}T&b!*I&=qZW5pB6Xs7n3$lSLle`$-Re^C zN|-{16M>E0QUo=yfPS!F{YgLBCQvDdcBx+!(j zEylQ<+G~~BOt`A(J5BdvF8E9D6d)l+WW{w|lO?OhwAmaj<=A(zu6Iuh7f~;iX+4%U z!cjqUJeN^G2=9${n=XVIH+{~!A{hlhC=l@|`8(HY*+PNvp6FmDsdNEJ!}km_($1R3 zkcj~cYS6+l%6jNhS?|%JLOHclM))dPQ+v)xot$sFcVO+iz7!v9ys?fK!IW^i>o~Xb z_g8CO<2)u2qUs_p`^QxjY>2y4Fa^BKJ|moVo&F8M%LjK?6g12|_3hF5xCxMtk7OBp zr8|1j_R8q?HhcSBrlY-sS`{VZ&X|;aGfmto?i3YWloe3E^rGRM2Db$zrh_YGVD|*S z#4;{Ely^sMowbNH8^gP_%j>=6bwM>;n%^cKamvf+Yv~-Q$(PdEblA-l6+MUg+)_Io?QbsWO$$j$%fd&@6j ze%`@<9slQ&=)HcjM`&BZo1QzBM-g}#F2nc5y7w}ae>)}W#AIOG2DxYf_g*xtY&oqy z=%=1SMOJZNYio$&efx1?-&D7Ys?V&61ldEwW1Fhq@7m`23IPS`W4gN{#Ik29gB%7% z7+%c<^Q5SEkr)S(naZs*>^?Rxn#Xn#M%cOR9a0GYmYrr;{832eWaCrVGwSg3F?GUMABTNOd)iRNiL72CdbXq>K>%Vs%{5L1o)Z_BF`A&Wqhra_6a2WDhroGVrG|b zR9Qquf4dwOk`U)wD`ZkEl`5J>J|PX~R9!*_o!Ad9!8yD3qQ|e^D%b?wE%lhEJvW{l z^SnNFC#j8+y3yMw85wC?WcL1kao0KgB9{Tay4_)Q#S}*%na!mcR!{{1bQ>H<&iYmN zf9C(^SbNl)ugF)bYFOK3y+1B8;SO3?o2%=p=kU(#y$d#$(x`lg);MANsii}2u61!u zA^ds@bxygd-p}3^Dc4lA9T4q8JFiI?)3{s<7cwu*=hn?9(fg;v;lK^Bir3MWQ*UQR tg}X;g*24B~WD59vkR)uV;7s3_+i{3vO-!{EcCNDe&A;>|e;055{$CII!xaDk diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html deleted file mode 100644 index 58a29e1a9b..0000000000 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.html +++ /dev/null @@ -1,10 +0,0 @@ - - -Feynman Diagrams - - -

Postscript Diagrams for g g > t1 t1~ WEIGHTED<=2 @1
-Page 1 of 2
- - - diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix11.jpg deleted file mode 100644 index 672916a2d92f7765449bcddeb317d31c8eebf9a4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 43691 zcmeFYbyQr-wm!O=ZfM-yrIFw+!IR)2NYJ3c3GN;&5L^QUm*5)Q3GNQT-61%^0wJ&W zJ@?4(?z7L?=Z*Wv8*ltpk78BNRaCE8tE%QV=ls^MxnFAlw!E~QGynpD09p74__cvb zE%)@PfwHQSwA^zk_yqt!#RlL90{pLyt+S)5%oA!Y?N`*l$^Lr0Hg>X?R91fe*YCeS zU(EpUo&aEq?f2LItpwG?)X5m$>KtA|;4;KI+0;m8MfDe!WlmH#T1h4_zfB+y2NB~lRJfH-q0os5*U<_CQ zHh?4G26zMhKrj#v!~h9E3Xlop0!2VMPzBTjEkFm*3k(6{z*k@qSOd0!Z@?*V1>C_W zBou@J!UK_js6g}}77#Z`5F`eY0zC(*g0w+~Aajr{$OYsL3Iv6N;y@{&Y)}!X64VIl z01bd9K=Yt=&_3uK^aBBmfR2EVK!L!3z=0r$Ac>%WpoXA_V2KQ$dkys$ag5% zDD)_gQIt?jP&`ngQF2k5P{vTUQSMN2P#IB0QPoi2p!%bJL@h_{L0v+IZP8wU(8g@I?PGTV=NRbdMrsSeJoF`M67D8F|0#u6l@0Ur`WHt-(sg? zH)79XU*h25aN#K7*x*Fq6yprw?BPOj8E~a=O>p1ge!}g--NXaq(cwMCGsX+V%f;)# z+r~%2XT+Doe}f;6Uxq)5e@cK$z)PS(;7*WA&`PjI2qt78lq0kzj3%rioFV){L_s7; zWJ(lDR8BNWbWKb~EJ18a97bG0{FV3z2{nlni4{pKNj=FjDTI`nRE5-yG=sE<^pFgn zOqk4=ER3v*Y=InsoS9se+>87Z`7rq<1tovzRCrWk zRF+g9sJ>7gP!mu~Qd?7hr0%9Zr6H%0rE#IjrWvKVqh+L3rwyR3pk1azr4ytxqx(SD zNp}jPf+@nhVI{BydK7v=dJFm_`ab$=21W)=hW8Bh410{kjB<>gj3taqOc+e!Ob$#r zOw*5$9tl3OdX)ZXoEd?ckJ*Abm3fQ>frX#Nk|mvGf)$Zfh}D)ghjo?>olTO>m92zr zot=>VIeP$mJ^L{SjN=tYEJq*511BG+HD@m8A{Q=~JXZi$6W2L6Gq*8!D)%%G29Gq4 zFHZx{87~X3DQ^bv93LK^BHw$yPQIV~0{o8r<^1~sFabk>G=W(`d_iTw2*G|KM4=}_ zzCtZRw~zTBJ3p>^d?L&yY$aSQyeGmaVk(j=vMEX{`dTzwbWMy}%up;#Y)za-{Iz(F z_=W_Xgo#AH#IEEc$v2XvlE+UtpEy3LeRBQu@l&6t?a#o^WS)gR8;!A{uHM78~BbR(hTD`qD_wD9z~9_^EM{@i!9*lMg2Q zrlO{Crn_dMW^rbF=A!2D=KB`n7Ks*zmQO8HEYIG^zR7%ZZKY&YVD-yd)4IY2*~ZYO z(H7U%%C_5%($3v(!k)!G*nZ97u|tBxsiT5pz7xnv&#A!~A8r(fU6@>gT-II1TvJ_d z+%(*(-ErJ)-G@DxJwiQpJ*7SKydYl2UY*`F-u~X}K9WA!Z$WR3-gf%Z`M&er_LK1| z^hfo#@*fG{42TQ(9;g}E^p4`4-@DBqnV^zj%wWgh+4sWlGeaODmLa2|JfX><4`C)@ zgW;UviQ)GV#u0;&T#+9me?^%^jYjiFXT%`I*v8DnipLhl;lz2yZN@9c*L|S=5cc7F 
zf-nErd_9-q)%o@WRz!8WQJ${ z$g<3u%a+Nm%VEe#{Dkz$_0vwSMs8o8U|vx^X?}SA&jQ7lPLzD~@j&&$2En_by zu8^HYBoorCRf z_TTOfgO6~I(vF#qYfdCihEKIlx6d5Uew{~LkX#ge=l$M!seHL|Wpj0R9d<)uFc<*>Mnr_b@eIfZejR||AmY++Nh0B?yoS;`;B))OW+T%*sq7$7oj8T@ z7&!)@pb`=hlaSIgFfu)2=H=rT5EOd+^qG{jjI5meOEq;3O*n8eHZe6bx3GNUOWG#H5eODLJ2V^YRM{i;AnNYijH28ycHBySjUN`}zk4C#R;r z&dkouFRX8DZf)=E?(H9(onL&vyt=-*{qdV#AOQRqwf?5r|IiBut``CX0){|;(+h;) z_M7535JVa-BwR@q=xYZ&T5f-2{3o&5l^rN_JgTPzMvfDxgfQN9`m^6u`!CJ@oMHk0 zuQdCcVt?0b0YC?X;13Us1Be6HKbUg-Q2%fFzhm(K*Ez6r|9XR@MHDk}LU+He$iwSO z+t!8X5kB+lgzH^87&kk9Z1eFp+=eIUZ}hGld&w0-o+^zm_! zptR%YE>qzXCiC(V5PAJeAr#dbT6k&1Yrs|kz+blIx&dYJnqqxe}!IIzuy zk<{L)+hcZyF=ZkAtyK8<*cE6GxVSJTp&H`iSHfZl)TI@;U_chG9DtxGlYgLm;CckJ zf~iXH2pw$$au7cRm3Ejp1Y43ifJ9Ce1!$DQR|fqr7N~fI#X@gU;rxhsu~vtr&D~g) zo|eeL1vnu#Y&dz*PhGa%E;SKZGw{pJNv9g7c%P*r{ISsatgmV zG2P6V*k=31Zwc$ZxNRumC0?yBy9n#ue#~SS{mgMm+2Pgt^qNla`4KVez;t|jpSPda zJG?FzJA$5;$`ME*r4XIr;DNLx#9oqoC$}O-gk!S;+nezWGcUK-u%$NaD7(4F%$V%4 z*o^xaqE--{s}S*E8oH18x;UwYKHpTzg;eu}cUg|Rp1d3yXFqzu83~E1WQQOxXk0}K zANVsxL%()Ik*KQ(?Hg%cUPyYU)b1dD<5Efcfdp%#C_Z@_rWIB91D_4)I`2<1>E<(} z($T`Jndi+m;RK6U?{^2#OXU-Zg_R_DRQ4#Li8JRmGCidA_pqke1Xx|l!2}NLm47sY z*fVQc#AJu`l3r<9r1wF!RJaE!`a>IRNElSO{#K~^Y+WA~xU~uwud4&Fys^0&n^6C8 z?l5JcdX^H+zS8R^H9d3{h+~*5*T)ik=G$6LjTf=WNYX1wPQ`QP)=hI-B2zP|J9qG^ zc`zQ$-7sbO?eK8VmYI3Hcf{bW(Jkv;h@fz*QOlQ2XczBU;>U%$R6CJ2nz~S$`bb7K zxtGI`;MKlC(kGmn6+&yuP>(G4XxXzDiA{;&J`~H@j;YfG%w2`1hNv0uA(+Ub$Yt`k zJRSN1(+{{e0X96!7mW1+=qZWmWPP;V61s9^C9ylvAGuG(lu^c(eq_pjwRRa{Uwt{n zZroVipm-*qPVCDm*-w3rOuZKNAI)tj(Wa=84Ug(nOS*<2Oefq`hu@e?i6jXRA3{~r z5Y0@ZFa2}QoC~z@C)U0FTj3jn{F^RmD;7G-Ph)HcE4oWvCZ$uKwV`7Hc~K^eu#AV= zi~N#L?gi_8Aube3@`{?zD{H4oM;ES?jqjeEm`U~{sME5hArye=VlykhQFUh!PPFJ1 z$h8$Di!Z*mgHN~zTL{6B_@OQG;4y$oYC@$$L_p(AS7(TPT;6{( zjvu`|*92Pzf;0L?9$?A|`+X_)j4CnC&1+s=>H``YqGl+TO%fL0bS8K_w=z1^Wacm z3wBq4N!#f|(PqVwqCk&prr~(@G@TW(_3;-EFMN-Iw1{Z70S#3C&vesBqCLRt8>@~V z(7>bJK*=fNYPltmTw!4$FqpMF4bz^-KmNeVF^mbw`F#QRJ+OGQJxtcx=yWXbuVOFj 
z{Q~la5NG4th(c$c2is?IF_VGW;?uTfGrTEoJ_!V2^d^<^Ej>`F;Ub0Mam*#>s+8EFy^t%e=hI09F5&-3?>>UWQ? z@fSeop0V1M;@Df=&gXSBZmChV^)r7fI@bR2BcrTJYK$(Ji87)u@lF2b)9O2hP~)Fv z8)T6sHm}JLl)0E0K9~@^!wFbIUo&-zaTjj)Gzj=oV^Z1B*4#90io2|5ptvJPL((e? zxR?p`${EMGcdUydt}Snf+9_1looRf+jeMe@VWv#6(wjM2VGpH3F^?!^Y+^3u7c}CD^&PeRL!*{iAcgXOhp| zbagfbXRnhOw@g#l8znkLM#O($4W)+J>qNF630ez4W_+U_i1yn!CsGFc7c4G@JnNGL zIOkb-4+$kN){2aiQ3e>llzHzL@2xz#aoC`cc{v%cKE-O|f>d4C`hoaiym>yx5lD~? zX9SAc`?pNM2jb9!sW%v854@(U;)b-$ZC zig)x_udPqq*w9)JfN{X$IMt=VusC-B(@9sXqU5*5IqS`{{-t*B#NaYQTlFu%Njcb*oy7l8U+Wc zTC5N@=nNe_?RHyTm?aW$SYBKh&R&}e16LYCe72WlJ|ZNx!@y|=mXwrJ&N9s@qYHe^ ztuArryGA8T{X}G|7GTCW3BdH@CaY7C1R}o59~T)B>}8!HI|qCxNoM>wA$;zp*Z^b^ zY#HJrrYx*vF@bwA157=|4K!rlIp^kYAv96QbQT%Q-7iSgiME&)1@LAC0ZavFU&z}> ziv#ACYe%*Q%p2m%``4;Mw8~1@m5jk8ombR|YyQnk9Hp$6!a}SkN7T)=@nq{6T(tbi zbjSFUMz0!scN9SYmg|-Uu1Uo9!lA*!f9O~mf=s7F~M2FScgjvg9#NN75a;GTo;`Ufj zfeut1@SN~g3rW+EK)!BAIoDP=rR0WP_=GUwETfEXNh2kw2~838^+&cRvLvQkSX}e5 zCYlS#7=E$~G9G_CMgC~|7~jNqE(nCz>>ctAiPtXlF(tnoaX($d##9cRh~Cw~SBCyC z8fGe~*`Ry*8wU2W<4YyFQ79sG<@@j-TkB| zudUX4YrqarNN(fe5FkmR z=30>`B9(3R1^VEUoU$Ja`CRrhy{ZP-*G+_q=b^lu#m}z&ALzPCpG(G0>MA74}Ss8 zQRZsAqNi+qmiz;F|oSdyV`%mzxa&Ljq zmh?$`p9_m#WGFw3shqm>o?hi`v zw+=~#;&QAoKgn#-Jop8my4TB|Wkm2Pl3K$2#FJC5B?LG|q$X6?Ni8AWEJCfy)j!+6z_8E^y?MDI6Hvouu9j$r zEAunM=IHSI-a7%@8s#TbHZLEdK7ve?LEHjR_9ad2UjSPD`tDD3@mqR_UqJ63c$}$1 z{1*_kczW>=Om+WT2y4^%_cwZR+al?(>5H#xK}ppdz$PHwjAONPJT1Nu6HNI`ejZUy z4rc%@3N?{rMCdK`+`VLLX6IXs>Erc`je(tb?O=2?2?}WhS(Q*j`D1;xs@M$q>lFS- z^K8ojbF+*{-eG960|Q`bYKU}822@kZWufqvG50;4Jo-Muy#UXeJI~|GO~_op(&Ch0 zaWCXbKKu)S_ea+IALRt(f3$j9ur}QKlx}{Gi_rc8zK<&E!G4@z`-%C>&73G&*9cE#yZnG%TQ~9w2{)`%-eo+!n}iJ;eINc66uw8^ zBTr-rx?k7JX9o}pt_`Gd?P0;zDC0XaQEMwmtpp4f163;(QfYjUieL;DJ8l+pxZUeR zbd|kplHbO|ei(GM5l(rFnwnI;S`f=d{Gj1@fM#99`i+$Neq^ApzXjL9e_w)JFVTOa z0?FM=@vHN|Zc|jBe1LETnhd_AX@c@?5WTYfVtTG1`sh$Zj2h-<*9xrqj3POac( zdI16>5b@|Z6Zmrue7az{cY5%3tt(-7Y`Z5iwOsSNPNvFz5~A5~io?e!?zxJ*RY^Jg zr*8E>bG82_tN0+s1k_IhWhR-6ATTwh6Dm3tUoTkrDxy-lXsP)_ 
z>L|BT(njR1Rc<(wUXM0^bi;jslGu(P*FEE@KsRYyw67a`9%6dga={O@9v*^bvQKgR zE{mD;exqcVrT8Bu?NODbSSU9g{b&ELP|!rcxdI2KB)QYc z2W9B!`zxO_7Ry31)rp9Z!y-eIhP%hdA#^XOPEZt>EA^V?RPO0abqwZz0Xjspi1$w+ zv3-`TuCrm2eGXilrIzS^$7(_Ln%e_R^55yq$Ken-_C{fy;5w*^EX+fc4K>$duTmYfgaHnxj2vU;TT!HlJ3(FF-nDW@EvCVPnCkdBdl84L4%YF)H%B@@`I$ z+7Fbi^}u*0mb6ZRX=9+7IanVZq`tlFG=c4A9C(W>3&alGOW#!Ix>K%f3@&fBoB>kj zhwb#nODGt0a0Hv)xa8OhS`u@|zNy&w1%%b2<#H__SlaEDHO5GnV7nNwi}nFnJc2ND zbU9WJVEQKLOhNDLuzsdaLA3ro?IKO>Gf094$RzoF3HN>D0%pemS+w2h;##h%hi7>k zs|y?6%gLNLrmB5jNAUzkYnxsYHx`q?aU|O2vG)dOHLFn3y1H;Tq%yH&iq`Gw=M}iJ z8ay<*2u`g$*CAV*5ylft2(3YpZOH{XTH1Zv3k%&bovEP%R^B&>5w_ z@kS=ribYdgzYTi~%dJiAhX4{SD&=-cN5L(e9;cJC1AUB!7D}4)v3I4`S$ASp=e+eu z&vQ-9RTACIgzK68pt^Am3u;asaBrvX8b^i~H(QP__9;sdF?C=E1P&XiGssHZ%h|T0 zEa)Z92FQ>!Rk@Mr0+4t4=p8(xQcG@BH`ULOLS{YlK30VZdh@=6SxhA8jl;7P(qz|f zzHnU%OYHF~VedEnY`bImIJ_5PYeO}pw)V^>{>Dq@YlU&fyo<2xF#%nGTml;H z=O=p&OA3g}_Ie?xD1up-0sj{2vQTz)CwfMev1j5tm0$@EBurap4#ddtFX2k}85Y>< z?y@vENstk2`jR{_Ebl!M2>y=pDO{O7Mc zgl3yw9$AI-A0dw1Ak{{#{*EDBP~~a;y53lri!5J7-F~%2)7ygW7p3JpTQaq$g@Yg& zM46EQkdQv-+k*elJA)E!h+@>(m|gTWDc0o`Sz*tk8RyG5<_L$>BS@e}s6~Hy*DL>> zKI%ALddB*4S%`ymb;2wt197(wl4V(@)eXbNYm{5ns0Kfd|p%H zAtp`DOfzSjV%DZ1%3;sT+qcr^h=~_Kjf@ezdu>x6IlfF>YSkpd7xe4o-1F0=@6p#K z7ir*o;i~3Zq_X_lyE*o33p<>zluOUsjH+o|5S?P=OE@VyX@UgsYVmeT=8ljp)NJ4P2IJ`C#joUSorf%ZIYuzEma$+YPY|1y)Q8GSEtMv&B{ggsQ&Rm`OP0w8JsPLQ;A0=0Y#9sdPXIcJJyTfVY{rC*Cg*>l?i;sae?fe-_;M zn+&w$#5UGhGU05yY4E;39OC*2uceN=f0Nr5d3e9sfj-T*lp5g)i|4aHN;-$G1s|;F zc!hjQK~c_)?axk_j=+{&3lQwFKV4B=Gr;}Gw``|6T_z$|^m*#~RXkxP9j*o*!-oB0 zf*Q!bG74C-0Jr>p?fwBC{*>WvW3%1RkO+@rp$)(y;8CmrSOx%2oAtkcakKvmplX7? 
zGk8cID9O##+rWm7qwsvI@tMuSyx0)vNvCVSx+c^(q1X*U;7Bq9u~4qbS`x#@>1cE_ zVGnctqWyi3FVVevp&m`t$x8)aQJdiaKCQswTZdyoz$3YXPS8lsc(gB2jKE5tcgFTh zJyM=g691DQ7^ z)s)GEV(Y=e{Fn>#t>*f01d!EaaP+{_&3*@ zH-TqJ&PQUxm1ryo1X}$3xw)0Xqesn^ad6g$ThUF|#%`#I54S>dc)#j; zPNlYSD4hQkv_R$f9ww_&l@&uo@89{8==2xRPU`0Gy#Fvlx%lm8oW4y$2A?6;0O7HB zE$f||X^1$NXw?J3y+F;WS*W>}hcUT6ED?@K;iM-r?dx~Ft|owA0)^UI?7;UwNDco( zIET6QXJlJJf$p5X%+9OjXQ#udAxja+uVefB8S}K@8>~3Wn9sVxoCuMRsq2DX&r9j%U$wH z1?xOA>+JVCHDh0f5?Z+;z=SRWWLpBoKSqJw*qb z4L!H@+2mgUO6%c|m1L2kD<6%a7bsWix9Sg*B|%b4L-n>h;sRB(x$yM__wjT(QUn5N zTi?Sud2GjdxHVYz=P(l{gAZJSqufd_Ga68l3?eTT<)@S<%E~g17bX=JmAEa8j2MjF zP(7X>AU~TkVQV(qFhtwA@MgN)PXAOob^{A4vu>GkJgXQ=D;re0 zt*s$q14#=e1_whc4?qCbV{g?Pe~xJO4;^+sRymRu)!JQ7+)sA~b`CA;oOXQ4U#o|b zX;%R*6*@N_r(Q~@-|F%zEElX7y4@;7KP`;#^gv8C-vq@MX)H+^cH29}_8N4Z_sE%L;ArHnnQ)Hlmy}0$~&Ewjy#iR`cf)Apf1-fl_T!K-IopDpGFD zFtOX%upi}ei7b-h)^iCD;|EX#&@pYJmdHikT}5^No+$ChX!Q>*+l;puY{Gm3vnC}k zYqG1#r*a#GP<_~Z#~R(17@ECF(w~JP&zweDm+Im`(At)5$$P#SUbq=$YUVE29s@Wk zX9zeK&gxC9szkrg)99(lTisqnd&e!-IXI`gwe~4+PrVfbuu##qze$p8{EC@`RG&Xo z2sPmkb&g~*9bNxSzGb*&96?%%RATH>tZx~k+eg10CwPhujaGO{Mx|YzhXSpV$mCWo;RFecsbS?#MpzA7sfam?&{P8$&nX%y zp6keqR4g~P;+Jwxx&iG-dhKwlH@0+>yP)tI{Rd$&J9|!wn?E550ZR||1KFjQBNP(<8Yh$N~Vz|yhyv)EhSE&r!-J^ZtY*clNk!+7YIe{d zu6-9ercB{w&V@l`ZGCMDf86zv+)jI$>A>-WesTX)C~EhmM5eiaM;8)(eM~{HscNB{?d!@b zKU)>uxwCf$hF>pPsR6iWS?^-`NTv`?wtSbE!{a6D4KM5O(H>ix(c`wgKr$l$F^VKnwy^A_Dwff`N zObS1e-tWu*hT8GBY4ext0LhP;?1*PdMAd_cO`uBextn?vr;v^LHG@YxwK2_%avV0J+c)cFLte+kMJz_w8%O;?GZmi*h`Q@~u8gQXDf(YnKWP@GY8; zHbf-z_l}+4=jq`WcCN*Ep)!Hv>rNxTyP-177!$_mFR#8rvl5N(VcoISUS1|=YVgJg zggp{Lz!1NYliT}FbZ`nPth^7&UVSNml;-5GM$kVa_)aCU`rF=LVkrql;}ni5p0A)A1x$e zj25*K2J}tTqz(zgxkzfQ0p}0jhN6c9=GsG1!Bw1jtDi*58qzX(C;+h0zp;Jg>~;u> z>?Qb{>XFuK4PGHMcaOM{;;zUBz+(5DY$)oT5lFiZ-SxOK zuyr67KIcZFWL4$BqE!wDZ{+_Q=KZvL{xCPOTczC+nra=RRPPqg&FQ|i7~5=BTsPIx ziv^h#5D2U-Syr|pdZ4aJRjYBaVSY9xFJ*!W`Bpu`AjuqA;AHB-EMq$v25j$)i%9&4 zOe{>fj9y#|b2k{7BFUbA^llydd1=GBYzrr_Z5qzkWj}n)`fByM;&Lt`2eFf}AH2 
z+mExjFi!$t!aupEDc18TP^zCbtRH7TUa<Ua_B{v1d63r9I*)^G1&DOKE|7eEjSLQBFc5mqQ@z=8?8+Wdx1+! z3b?dA$sY<)^O@ zpK2)f?K%Hr&i;Opu-$vb_3#mMwW*3SJ8GqB1!yw<8QxuOW3c)8yzIirR=FD#YUE$d zDhq{=`9EN7)0?OV3XZPpqbNNqSQ^8mFs5m@2VJqEf4iGLW zTQv{b0&OpFFRGo-k*7Hq|JX_XS)0AGA8hx?Hpe14>=;oAQOg83*MywF8ZY=|Fo0|6 zSr%1mSpUOj*i9qqIF#4B1m7_-aLM*%|Jo%9`Pad|)81r{T#Njf2MdNKtLBpTafr;3 zJ7cgwR5`l$N-zIO1op2s+WYK`uInv*(nexQ#jCkhomcY<^c!zn{;EUQ-F#;oe(woCL@KS2Q>68=#48W{gAtzTHJ%S-yYes(KT}LZLb%XA=C>_L*jA%_onDPg^XtgM}bBd?7kW z4A!T;a`CQJycV_%62)QzU-JVwj+h^Ba}is<(HLd(#|K+Sg~^csYiXKtCHjg57PGT; zS8@7Ec6PAJMVa&|?jjl_kPyr```h?9;vh@rX~vhItyaDd$;LicEMKa9>bgQ~#a+l# zKiK*-T=3&GHa>A5g-ZUPFkMC@5~HOmm;1$DP$JV{up@MtEQCZh{*;G(J;Fpl|m%j1}UT&x{?f#WT9Ga3Xd4Pv=Q$5LGV3xs35h(#^@MK5zZqWjU~eK zXf+tT1M!*UR#U=o57ca$;A8XZ=9$;iI<^YOAk>JOT<^|~seIj-WVkRmKkB|XdfW*| zET`8&&K#)Psd`w?0|xo;wm}>WD0EI~)y59Ja=rF~W&bl+)3Um0n^1W0fE0isu8^s! zHcbvFAGqg^bZe@Cf*KNIj|;fDp&dJ@6bG%e|JAv z9G49vWc(}&t$Nd>9@Ut)WSNp)CwCK`~Zj{_`s88@Sg7s{{BR# z4#umQ6Dg17Puja=N6-B~SmQ=G)lP((DrcR0;_W92TzRIaxg`?W>n#kV$4gSjl0gE4 z$;q*0)+Gp;%3mlEj9^8*9^d8VKM7(aJl2kJYsel)=9{2H)ugw?OpixyDR}X zX{N=1Jr*7C>5F^>spam_%O5H8yYHWO5v=O`)UmnKq6u%_@%(h@HZ*!$Ip?gad*!N?0%kX~L zop>D1m0(Go5%F42C+t#fO4Y1M|31y;WMjoW^f+dn{{xiz#8lrNLF!XNKuLO5#W)2CebvDn;~K?J;m6|144;R7omw&=1qMX zTbmfx-&R0fUM?CNySp7HSoeL2@wv$-f3w7Zfl0rkiN8|5X6a$>olc4A3K}iqQ6hdo; z_;C&1*KVe)`gt3#i&FaHTWL7uZ7vrQp^2x`Hy>kN1;@f}wd(Nkn(0%8bzU1{RqR!K z#qi#`C^W^CP4AV2duN-WlLJTXbW!=^N~_TNifD z(n>xWf!K4ZUdx?RY8Cqwe(Kp@cwW`RrHAjVk0MJ4p>)aBLpF>&(6Wj}2zf>Zzs#hqr zTp>NpFLv=_{h6WnVp_`FsZL#bp2lT4fpw_ApX90t>m6U2v4+AJ&P|wH&2^Op!vt$g zm#x0+Gso?YRa$c|_BGTvPH9XDem)Lj3sS}7RCA9a>FNVF3@eMmNa{Fs`-o)aCo#kL zTM^_xNVm6UynQ!Tv6kxsbNbdpAI5WLb~U{RSgG4Kx2+ij1`E)UX-2Vt+1FX_PQO{U zCfjghxMg#^!%t-GD>5@upbUg36)Au+d0=bzlpw00SpN;_(eg^ZopoCu--n9uXV4+m z@xCExg=b}5aacnA5a&dGLQm5)S69NASFifgV>*Z`{G-qJnIV0Ff-giE)s|x-7tg!j%?**+@)G9mSHH<0nGZBVIkUdNA|SBBhtc8_rx% zDiT%@i85g|?I(7nsr-Mad&_`0yEJRJaCdhSia>C8Cs+vX?m-K8hhQOS5sKgtJh;0% z1Sb#*clY3uQ}3LfKK=Fd+tV}s{W-r%ct5^H@kuAx3g&WY$|6B)ias7_ 
z`E%|P&{Pb<*W3CE926PH>I9UusRXaa-ggWq|@Jwh-}1+I0}7zv@~1r^*-A(^v0SM@iCHUwzzk34s_U z)_-wJ$o~bj#GDKIB!+uy-cyupJOS~J zUo-hA-T-3xwZGLdLq@%Gu>FQZj`HZcp^Q@-5|)pLIw)&bj4@4_)$Xp%YQ24{sW#74r1R=3djG%A3n0;;u!I!ALjJ zm2PI6zXA9Lhn^Y-+F9hm#&w<~d4V4JN4aHbYiSU8T-2r&F)$4+r;BpQ85W zIKqe(#Ud2QMIN9voHC)AtV>j`xWbs7daU$7U&z};vUQH9b?fr|4-5)#+8D)cKK=@$ zZf$x20YJ=dEEacH8ggp+U`2UZiqQ08L8~2!22V|srtS@8X6{HaQZVAyea5fPaJ%Q; z;{9UuXIIzhn?ZO!fdLB;&YbE<2YU1`MyY|`zX&Y@_f}h>-?PkLL-R<-bvZ148BR*OhA>16stPsfsCIExS&7_W z!l6_H=Cg0pyUAt4?MuD0d&dg*I)JG+u?A`xU%uMXYrJ^c*^=t22_?C$;()bRWN6}f z+ZE+H!1gLw`{%D7js+0gvaIo8k!AMSTyhRnD?y|v+VfB3a=r1G`)MXW9eHlATpj6K z{9^TpcXS}{oLqHH@rfk6$e?J$!=hGyHE;Y=TgyKXIN-8N_VmRXw}b3$wfUcw{X6Ry z^IN%NHk}2=QB)+2O*Vpp?~P7UOEU3y+&i+g)25*Z0T9oMix&xBWwY6b(ou{#_fW54 zlF_!3;`(aTBVO6wNcDjdQh8en6jGE|u}beH@L5h!s~cCgNk`l#9bvaSzDJz&(LVoJ zkG->@e0}_oqR#e}_p>c9;Vp^R$TuHPT4WSkbk8Rf!__Deg!Lf8f*O1BcWwo>Bj-%= z=Sa4?Jds;g!|%y}H^V132=Qn0eKbUmvamoiL!>)~68#h=-+k=zSsHH!=|urN^7tSN zHu@Goa;HVn1g>B_ugI?B)enYw4`ybCa=CQ!euo~Z#q=R#enKaf3rT>bB|MwAfYG2+ zslk$+K4^>hK4WrW&{PAI*%FTC*sY1=PKh^6MwzzH2%nXnzu&^!YOwDcP!NN;4Heea zre_W2k_zSyMh*x!{F|=+-?mKuH_w?foFQux#BLl$AeTcH8;KUPz)Q&)=MHqU%ZS()5Y?>`PGf|EG-bZ#+_VhE4-v zvA4*PNrq{DdoPw1_qbIDrxbpSHcaMgjO5J^Ml#N@jyiZue4Xuq@^UmAe}$}V?~Nbu zj-P2g-8GJ5&{lkzXz6bRpT8A<{+G`UJNp!U93Rn2B(YACt>}Dov$at{N%*;Uy`}7; z$dX%ekMnA&mG zURT)(Gb5zAM52{Er2UqTj~h^O0mn?bM9mYp1gcIy!~_IfKCo&{0@lAvYY`Mj4Jl%X zqP|q0Dp2rPxaR#Bi|DsN^fJY+!g@(Ua1?)p%ex|ULaTa~jjjBc%>rVev%pn(mXUZH zJ93zq*i7YH3fI36CDd<^D<`C;dLB;0Au8YVYw&tdl;csp;e?o^_@bziY#(rk+Dj{( zJs0uR7B-t%fY3e-H-MNmQnAXIM$}0|2gzccb83-Ci2$*=S^!~mSZ!GOvuv}GpJ0h= zt8S)a*Aoan%G?xE+zqw&hKl4PDzrc@*pt{PsHC)`p!4|vyPczP_WpK-vA;xXa_8& zM2EDRe62!v;@@;BQh8^4*txskTK^9dhyUw!9iYR0j`uRDD-Eoaqf#mBU$$Q*zjqmU zB}9l6A$Egs8^By#VX{HjPZPq+kZzJK8RMfjn-jad{&e7uDoy#cf^+Vx`gh{&|89Pw zu&>p9gatV+>d=10LYGJRGwB)^DYnOYP7%8Nmw zBWfNQDt`l<^jCia3UyZlqEC-jK~)XJ>OLrBLv&v z?tZ+));yeQQi)MJfpOHqxI$JA9c{~ZKK1Z7pbk6S+6SvMvg|jYL9_2-j3+6bM=2*W 
z@B6!2Q)&+GWCRxh0Cl%vcC;AqR-*^|%&w&@$U*b{10`UTO)3Ii>MSgRbVZZ$YCy?< zYUA{y5Nt?4iW8D_yZ~!9Hzl69HyzE9uAxiD<71}co``S`y&ezL7L`ppqw7myP@VC> zusOnGU0CbjmgaF=$`51^BmJL zR&b#m{{@99v9e}$?+L&nc16!WOW=GU5;>p7aISo-HJKrt+#oE@qt^-V+wsS@@4n?& zpLB3=pfulFugOL7P%Ja3H?_33&hp$`V!XcinkI2Hbd=4BTFAB5^1;P~jY-dZit1bNjom7wIbT_OSIv_|IWzYZP*mI#pb z`_dD_W!qQm-9TgRJ+ewITmq0!g`gB+K6FsehHVr +6{xp3F}o8D8b!~6Gn)~_;= zKBvWr&G=X$x{Ti@S_%df30flV3f}+VF1Vd}^8>u8GDkO8r|Xa=el+?+v&j}8m+qyr zn3oR;%S13j77i)QAyvUPu|df}^>V)NUTY^UiYiE(Jy-QP&UG<6sG*pC?G4(E|4tSAkog>UH76 zSe5P;s;8|is~>aOPCOHJSmRSA79m~33>rX-)Z=ZxTgM=1(KgmEz|#1C>a7+Bct^N(M_-yMhNnHe?WO zT2nNM*Kt0A0^Mtg(SToXC+0lH*n6RFs|Pn{dioWzci9wCh{tL_(gEwT3WM++{&UYl zhiL`Rql!b1yolN=Og^0b#Q|T<)2FDS;NVD&;h6ePs65XD$Mf&&D;d0 zbf(#cVmQ#^ika=Et;V&1Hfacc11uNp)$Yz0L>jWs;+B*!xjEt|1WCE%w+RPuk9#+Z z##(D<#I3Q|Smo4(Amt+0_*N`nq6Hpl#_omZeX~d@z68nT?)3m^$upJ6M?vc%X{*An z8QXKx@{^rdV-Lh?rxmXg zM1)Kan5$rRD+Zq*tEU~0&EIx?^pikkUVNXgp{K^MPXGwL*K7B+mugi6v^&Mpu5{P| zEGG?x^%DX=p0oG*c4V?{|CqRDNcN&njnoAT+hQh$vB9T^@)wJCA23&yJ*h}dU3D@) z7`(0Syl~ros#vC9$grje8E_WzMc+E{M=IIZzF&IAIpIZnicD?MMJf6r8ysu6%6|OOqI8R^wa9|_a@QS1NA+m7bitSJKqRc>u zab#}(>Rl>zPKm3Sf8cEBbIx-<2nO@m?QlICn1!Aqxyo`&P!MX<#8k4^og&$L-COl} z-as1ECkQJ^z7T$eJTxN+ppXrrAci8NS^4-s^Ukp_G{qim)YN7{=|9=Vv?^CN*C^s! 
zPzPgTS_5G=oBvT4*VT8mZx!j*ck}2zK8lq%;3Sq(xX5NfnN%Q#QwnW>f(j=$2fOA^ zcmMOj_StpBmBo6BG!gEkYhFHKUIgzLlFi?+jGL)l;O6t`*7%o+IZ*mM^@%aw7HpKu zPGhN`_7OmVV0}(BL?=>rrFE)gbIexVaKxlwIr?Cf_JeFn5h^opkVDpkZmRzb?F{H9 zs?UOD@k_B)mqoK8=4-(L?hW#ye%63CQ5WJw9&)XY++qc~JCfS5Kt7gbT@_v?c{0bn%!(2GXAoMprtn2shI%T$dUw zTRDFqHvohmr($_U_%|bfH=_O?glS>3|O|eq8#2`p%S#CBg|?LUEaj zO{FcJ`C`bgM;H-sdW~@=MQEHv$g$I@(v;V?Uo0#Pk&?X!uDOL|U}kF=eExquO8;4R zAvyb|^el6_x88~{`*^MUjV^{YL3ULj5~LNs0iO%c;LT<4G>cK3%%V>jbFkU<=lnsr zR3@52Z0RNY&M!YvP@tq%T3Nhg(>-F2E%+I(5d)VL`Xe{rdsY62lSAFU_Nmg3N>=C& zE5BNBuQ;S)IK!3uf)gFvGiH-$T1$ADX;963yQLv^_aji=nuet+(x`HKh+{~8lSvY0 zkkdiV#<>#3QT+7o%hH^>-d4!{ZkKkAb&|q2cexj5@?FC^}!q;2ZRoLpQs7T0o5^5_|7b0t5e&xy~ zpsGSQ7tG68Ndw#6*ifO5Ur+?$==}yblziMyvt{73NtYCs`;z*;OT^Tg334aR8dgY zv%`!D&kk(4>%+X#3Nx})V(92G}`uJdZt( zwLSn>p{52_amV>ibQObl%A_DcHKXu9#WVfqZour&*Kj`VG~(1ufmC;8U%IHK9@_dK5WbQF`L++OPR%d(s8FP6T`qzSqo$c+740Ii>_8x4Hpm`LQ?oO zvi0b4b}4iI)SjVVyWDr}IEL9!gJepIHR1)2Dy;cyVuV9Z5O8cGPgEXP$HJ1X$8=$FyAVVAiT=iOIXxh^(xooK`ZGB^ws#E#N_v?9= z9%}jRFr>Az#Eh-Aj9?BfgFR91USGDD^UNB3LGKpioJtj(=$mF?l!HFA$ro1s z0cSZ#7T`QuXL&)}ueY@_EmN%SM(;76USbUgdKCINZecN+AHIxF4L#99WTd)rX@q45 z1^-}=d{CP|I-$@w+rIwj0eT%Ff1;;H6(Tawi!@Jq*AOn)1L@{9>AN4#D}dq1Y|bbQ zd_ZYeaN6z>3U5VfLgiA>U$!C}g1ZRn=>nEzG}>U3QECI`P*$mH&YHSeRN!Gc#;~N! 
zrDJl<+49AC`+pEJ5#4$k`7XM4B)~6ZIfsV2@Z!kinF6{IJu8%1l}+INIj0?YS=sKl z>yE`%n1>uf<JEc)TPqk`YHZS}c*W;j_hbrn7Br9o?BSdSGt7YPI&U|ii2q?~s#-aAcPq$i- z;e}SLds_ab!B1|3QCq^3eL`2GYP?e6a54mlpd$&LOV74jqHErtHI`M8#lx5&{V`=y_>3 z%YJptVg{f(*9A{X`f+6SWw0+j(QY8cIi$U8)wYo!lb7e=aJG?n-{IN8{#EvMznEDP zVuch|EFwvfizq6j6T(1ExFIFL(Vv=U)xrt=gk{`#3z26 z2=KnID4;GfXQSR`Je+69W&cFug|$G7VsPc7i=1$}$N z0&GeGdq(ZZ|Nnd887{wJOn9zTU?$TG2Yvfbd%;%(Lgjl8!-z5K#lz3{TCo6c5$vgP(e z3Dl!^s(y9ryW8R@n+eg3OK=rC#^QV3QsEa2Q6gleY{AXbW%RsIB1 zwOInSegiUU0w~-U{D?9 z5@SY!8>Y+pcNTKBX1Jd8ltA$&5Vi2FMd+~QbXjFcD&k0`>qPi_+ZQrgx*1lPUfemP zw9LW2NJP8N-lN7aAkA1o;ta@owyd0u%_?nSM|Gy#g$?gbl!S(z9B}tL72o63ZBZYz zZx!vR-rh|#fBs&rrMaahe5q^TXOkwBw9v44rPnSeDAT}M zx-6Zf>cTIK7Kjw{rfTMreS8qEz|T-H$kf8VQJl$n~gLW zdn0=Dkz|7cOZoL!W0YxUBSee1gm>1}5Ulm3Hhf_0feL2Z2ROBedk|BXZx|>VeA;v= zA0~;RYX{FLPfzdnIrJqt#G(q5(Myxr(&Q!wyEEkiV9Uk%c_nJy1UW%jS4&!dX?aUL z?)Mue3$S_!2xpXdRzmvq5IW8|r8}g9xo(<39mH{9Og9BXZPxN-cB-X}C+1(8Uvjw; zsZpLfRHnwP4dlSW*=;9_{k3++FY6QEx7N0TuNhh0g`39OYN2&-g^78Al5$-9sUz97 zJ~onxpYH!0p822E+221}FWEHc`1Ti;2#w!pr*h!b4be$yeQtzMn1L#KT@2T*_Vxdy zHo+T2&^f19E+2Gk-F|<9=k%a zO28GW){s1MPgT}SS54Qh@Sn@$|9>ChQcCQvg|vElou?^jAzm^|4b}J(5RFGvT??`A zs~%J->djAnvU9vjvg7um>H`>xPAdW6gYI3OFO+35+Lq%>@ZV$+{Fkbv+Dtx*u`b=( zZ@fvP*qb)l=gj9Ay2fi=V*UP2pQjfjrHhS=FTcenDFJuzSqEo5{!<|Kxn|X({y~sc zM|yZ&yUFX};5_0thu$4+SbJm3d2>jUzb0XBoU3L z;$$P*0Cg8uBs`C`ozlq`moWdZ#E#x(>&?2imimCG5nzmau8Mdi}g1; zrlVk8c+Y&NmcekgM-4w~7x8GGRw^O6&*?U-6_^!?TfsFnExM*uBhFp{Kj=`@FBM<{ z{wIS^%VN_zesZfI;ACkXYl3T7Ov%WH#63|)`0c-$07R#i<-ZOE*(&lc*5AQPL4aWR zZ^gs!fx(3E3bX));$O>esg#%VV=bfQmj*@Ka?ofK0B;2lW7c3okk*?-a zT#!@!sA9K1HZZ4r7{{YIbr?e26c`?GNsvQ{JlC!*dlWjOlpW()mezn=v6gKp>P^0# z#_t2I(hW1?9_eLz5d*Z11Zd-Xf?57rJN&nk!|afnZOYvs zKw+mn&M%g4X=!OezY4Oh3UIYRq4ggf?gE6$Q%)jpYy4nHTj@?)F_16vlFtJlN5$$b zF%D5yk2*I~#E~#(#5vCNQNNz^0KEe7_hy?<7JSTa4={@%Vd{}eC+1z!3Z{5A)e1pi z@~Note$mnhS=nIR3DtZJbY-;iphLv8Q*L@J>x;m=wJQ6n9ID+*wZe+(uez+Ey#iy__isx;Qj z)G2Jv^ZfFGHE0|rN7jjIsP2#~-kHTOi(r>e><0>;^M82X0eX3GGN=^28T-;ke}kF- 
zGUJmTxhke*aRNA_0omDA9d0n7=sk9tGO=DLVU zTO`$$xgfA;toQBl^?7Ba)l8Y$%=3&pz5ORqX}Q>pkEvf(!6TkulHpIa=>XNc5GJ^* z0HTYchz*6+R%*%Xn{SZYv%&@5}tvV%PK`j1PS00w|_(d@?U*cv;j?L3tJ zYWX^e>X+cqy)WGwEMs3_h&k07W9(Rom5TDL8!MXQC zDC#3qziS`@dB$8?MbS4o@{R)EM%6ZRjvAS3Z#6?ih$`IApqSlCvVM=xf8U0Ii{B7c zzEr5LYpRVv6B(15hB^sGEl}119Yz3m>AR$eA>%#+(>7_DJGlWBi!0KICZ@AuV>sac zMp?6HnfE}PFL#l&nAjPcF0*NRa3`6k>!Rdd78;0Hz6O>ofmf77yNT`%bQY#0Zf5tVQSPh@oGJy4e`Z(~D%{8JYk{H-mCzOPAHRebv0jNE z=~Yy0w)&W1y(Cdu!(nNF-_YqYY8pD?GUXi*GRMux$j0e%dNFobVi zP;JLN%3@ok_|b~#rAXWHYGcLHc~)vE(T=ZiE@&f&{HL8UXHy$sfMsB%zFpugqY4*fPwBrisxuT9^$dUI5sbf2uZH)Au)7*L2Rn5+Xj*i<;OI( zCDLTO5ycF{BpKKc7#FAr4HkL4Wq|FhXX@>7NNy`U>zi6x?b5i)8-8-E<$?2Ss$4#U ze*P4DKU~o>Aw7NMgmv5=hT%WO!tOoX;sZOl+eg{xR_mc}7ryR)=I@00IXkgMxHn?i zb_RA;;wG1tSYh%4WA)pMiqlM(Eev~e2jw(rKJ8MR$RDnW`#`8DvOZo&g|WOMJpeTY zt=9T8y{ACWSy<(0*j)%kWE+ev{!+zkLpKfSG}~TPhDdJh)Impeehb=-V=yG4IDKK-Ak74W!`` zQ3}*|@gu}aLkL*CEZi~CuWmhxOpNSeOYNXLB^D))mpq59*L+huS_wXOP0V%02q+I=ziAI)V^$~(UDTO+ty7q@3aAIqXi`%5|onF+?E{rMV-9uJ8^WEYh*iN z|Mj2|YMtuo2*f84acFv>@m}Y|s-c>`>IBUp#t#X0{7U481Xn zVy%PP+nVa{t@*C~QgLXDbp%xihxg386BzzJ^Lb6v>*vI&R7WIH=25oSjyP2JWPYt+Yd)Y;0R{ zqJdnO!k>eP_p!Z(%R^}IAyNuJu)=`C3poS^faN5LI9uhWlKFu*PdmIQyGr^yAJ%$` zhj9Gz-o8SP$>9FCf>u_ha)`UJWin?M6vv8kKOWV-Eg$+8Lb96^LYx2ziaRV{folR3 znitYYWJEBp2;r|~2BoQT=&It(F^j1++Z42?`n3Zbm=G=lz#;c(hXNfHmW`SOH3KZXNX%Z)Yn@}C;@P!? 
z<`}r}FvW4FrA%TM8g0dVv6YI-0ynhA%N|CamFeQ;u_3rQ%_GcU8DMSc%2ExVDBS?1 z$q=U2t={!i`Hp@1?J)biPAGOTjQ#7h`zI>nyS7fYfhVHel$%eP-Kla2EL?K5sz|W} zJuM`Ww4tmF??P9HKIX)|BT2Nd<|4eLVA1hm4RuNT`ENM5ncA zlbvP*=K|qxlN1`GsRS+N@x77@aPRoW&?I_@FM+^>#Tza*l+l@v-s=39G|~N-AiODICz$=EHv)|v-Q3%YP@wRRRqD>y_WAfloyXLDw7|!CODbJ>9*IX+s@Mf zNB`=Y{Lijif2I&kheex=t_2uAQPZ~(_{^a34_`Q1^;G;2F;m_UZ?kWz5bq*T;eVv3 z4y=4CsO=-M$rdTUwxV#sIVH4k`nv72_P^n?|1+j#MbNnyKA(5!X)h^cLk*o(&I@|ZT| zm=l#@mDGukvlVsWVf3;QCH~8O+CM=N{EhbX&zi-QBK$2UkrqKNY~cmEchB^3L>JCk z9%VCfq)nz_C#j8YO_e2-AW$swgJ$cj6VmqtL;Xq|pkmZo1|<(Czn+#vuOLPwCW;27 zU?9o|$z~OT!fc|YVi7s#8~ecGE+b|-0&4MK4)y%Uu9$lr$--NE$*BtRO~ommMY08S z9!Nt80nv~Ey@bFJ3Uv?6;PP)w=)eCgbNp=q4-6iN&1(y)4?6moGQ1ejp>@&epSnYD zz2)DAFrUS36z&R(sPT-BS#+f(hesqMmwK=#*NpqTvwcM`<=QTdtqbviWXiVrl;L<8kdo!6&F z6H9lirVG146-upMkDY#wZi~E=cD^U^UwurV(DK;G%xv~!FErvu3xOqBm&BP_o!Hym z&Fz{!uz|Pns(R?N7xJQNQ=`nsuR1UCB^E!0s-+;wa`_VV1N;hz`uoqZDX0|XrbW~W zjP}wBi?f?!5?$aDMz_nPG}oFbbkE=;$av>;R~HUET~>6By1ndQc`6}>zHz|ECUtg} zqzQ*XzWy6Z(LeqBpW6dbHfUn92qvxY`g!gLUS1Xk0O@-S3fF>%9ztVk;pUshxtal! 
zXDHX=k6JVfG>*RkiJ_;*+4itp0Syh?ZveZ~W9@#-x%T|6Yd2>uB z=39i^cmM*Rd*a;&X|ilfJC&zQ}*+aI&7 z=j*bAwi*-jXgPG$^J7A}KkV6mKkA^f&BCiXN6y`_AEHpt)}q-XR>DwrLFAYgc9B*lzHN` z;?d+Br;i-p?$xOV$3gKXwXB=BOuQ0Q{b6QD!|+-NV>M=11rV3Z|99_(*jt0?> zyR;JDMi_;02lL}4=H@(No_n_D_FHTinLNCj>(Sxk3&-D!=eA7=AmE=NC|9lOWCeOI^~rH8an9DRg)ntooYP89=&O1k-IJKIGmG_TTShx?Yn& zW8+F~K-hWT7#6{(aa&+XI*UTgw>3+TmX6SMD9qh$WaBrwP>!>dA)*FKb1RnUjbh8xrkl9Ji5{Njy z6-aHJLt94}5h8!^##7rcm)UudD<_xQR_z`U>sx1E54$maI~}Q7#~_B{&`sC#Oi=BS zXEjoXHL4rfD> z{_*8e0`t@MQX5siDsni{;LqEao4seYo?@i))-yb%2F(o3b!jg=uEV^rwwVRLzXe!P z1hpqYh6P={Z%odLB2TVxsJ!3HgQ4rD2AMBq(gb@+o^}RIEhOua4YyJ#v4aunnqj*N zQJFF}r?rTE5cSp6Dg~_+g8bmu3Kfl*wawiAPZ);*D#{I=wMpOWTRy%DzfNo>g)c%$ zOpwzbjM!EDA!16swPtFL^C{Vj8F=3Hl-oh&?aB|os@IQc-p+z@ zIze31!Zy2L)-tKDr50eRCPS{R2z9L*G(?1)f4WfoWs&&?i<{IwPd$%67}7Xiz-UUf zJ|DhcTs8)-w7L!WzAw|sybaly45YQ?=hvBoq#2DJ9&nHP@o>U1Ewne_rK$E=jEqE0 zuxG-G8GtB$^m!0o;3U+jFcX-D9UtIDrwGI~GAc@T*p_2KrBA2G6GqjDAY*bdwk>~D zJm3BFt6d7Geu%Y#^>N?9y;ajPZo##tE)EUCnQRy#@M2tE9RViqZJ5w_oAi+28XUIN zdDf1HZ^p=8@J+ga@e~jQ0U6dPR9HU9r4Lee`dL7V{!kDHB$t#da{G{P3nr7nva7Y_#2Au!Ao}d#0jro)=@U zVc5-rG8_+&ln%iz$LhnEd$<@*+6$d25#BRvow+NJxXC$EL%Nl!&h6Jd3+j?b)by)7d(Lty~)04b<6Bi?v+8ZW|K6c8^lU*?g`cVe66gx7WoR7(gI_>qvYX%F^B z%mVC`(R}pOx4y2+Vw`cRV>V!wSXf+EL5!zb6TWV9lK+z-@721?m0Pj?;~c*c%Glj_5{+^XFgX01yU=fSWdwS zOj4y7i$?!5xDpwS8TV4EmoOLZgU#`VHJg^GGvEXfxB>VU+x zr^;k9z=Z`aKxF3wV*IMMJh~EwQCd@r5}-4M)SsRBCxNeK?6jd08%R7QHbb0cJ$FJI zHOz4Bje}|8&UP2Lr#@cV>}7f;>x=_YI4>QfX8iHugFIyBtpc0agcDYsnj%Y$LN-0c zxTEYgx{*})GFVWHmPY0fh9X%%7CgQ3yJacYq4S@J?SuC$J{Svj&I4|S7|U(MLz(5P zy&BI}vR@TGkYbb!N6_}mP7}tSzyWbR16I(+?T18QLm2P&VB|)vdU|Wtm^qSQiK3EC zZ#xkUu`MhQ(F(K6Y$APli|D!&=MghnRR2Nq+_>_o@>Kx=#i*{@@l>HLv8?daz&2(P zpqL694nA0<1_tWQ5;}iXVoh;)V|P4!(B)U0EF1UD_HN#|rS^g?$#LwB+R z+Z*Pu!5AdU@YSnt!UWqjl!I8i!Mv9wbZ&&PiBeY1{)2)jOkzpX{>-^H5Z{3Eh*U(E z{T1teST<{`>O!aIM2;d*`)rzl8H{U|;2`p9nSZ#&bql0DqkhU?as60MnONxieMh*HEte7W z0JFwekxg%ETAh#^UyEZ~lj(+m?vH~pcy0>{ZPN@jZU@Fsdv(BuN{*}*FTtieqRp1> 
z@3_q7ce)UF$q*QtBD@_{+wcdOP%K$uV0}BL-njJ;L{Hl2SD_T2(L~fnLVm#Z@o>o> zUhg)UqKZZ#H$CaB|H%7k&%E9)b=nJsR63`Sc+J%a*TqnM3B|CIWLtHvRGYPuN3oT0 z+4tBqqR?8x2{Ul~P!X<+e;Qm?dzrh2-G7xHi^ZjbN4NWeNGD!nZqV&vT zimhn_rkX;MX22*VL@H9VuzcjBXrbMTO49YAe1(7cVuO>!A}sY`P1cozh0dIEF5V}O z;Vt{Rv!2%mY1WW}QCC6Gy!EpKWrZw<0mjfU87xKcFBFK32zx@8K?IMEXI_yTX~G7p z$9`MurZsm7;MNeXr?KWPdPTgfNEB+c;uY`84U~`W7$DgVC2b|CRja6`s5eohb*>6E z2we(z;~QLW-s*Wi)l0 zkpnrf@w~D>i}7+G*$p|wG+o^cUt}7nZbq*2S=RGg1F|Obdl>-dmc<^-gQ*hB8_Ra@ zXQcg1aoaIQb@H|FnlzW0MdPg%ODh>AcfNJTCfM0z?}>SRHQXH}d;L7|cI?M5b!pEV z68wqXyaYpm#4mgs$ZqrJywj41d=2LYi~W=wZ$jwec+rb7L zPyPCZmU@a3E!tDT5{PZ_)H{0|O+9Ogqm+I!72$b554aEbe6?cRe?7DvegRDZlU$Y- zzhGu0piOA#|6>slIj}Z&Kk&s&Rz&GPPHAW^WY8S4L{}`(Id!$m+x=Bbvo+eDg*D{0FP{(z{i2%0t**DM%udMK)`m z#P>(hDQg}?;AVsmvt=+?3ewBD_WDpii|RZgWMkA#>|=?Xd&n|}TAsP;3kT+4d1`aZ zkoYg!+b}{d&Lpq>GT+6eQf$!-8~MRhB9xP|Y<=M@ADR9$=a8V_6%SmqjL+RIK24HCF?Wc~$A`{!H{p(LI zZpFiRe87ZM`GU2f1%J3Ye9ai^XutvEx9dL#n`y%=)Fu1>4L9;n?^%^D7EWHG`7?tZ ziVE*0srnpTW64B3?u;VEI+`j8ZXecJ{WJoEZe$|$W?8pANwl5iFJ&mBb7T)u$EP)$ zfSArg#KP?`e(aej^O|fy&^HB_awC1)w15Mi8|`~Fo9(3snxjSKy)d<*{1S@VI^Wo= zlAK4evz3rS+c}s0N#o%n>!bOUDmmbOD!F7NlrzFrmNUNG;~$R;xPK%PYo79C@~bx0 zH?_|kvsu>#z(C^Ytx^jpjPfHH#e&oyreVhjQ^I2WA~Z$beB&Dv@3QWWtv(M^^7FI4 zG$;Wc#j2n;!;lo-yE_aqI_LZm?7W#8${qD>A!1`J)Rr23DkpP0WZ9!uboTt<<|Rt1 zXyotJqJO7s=&)}DVTnA$rOQpe@2F|2ts6+cW*!Ld9rzN9v70P_?nQk!M2j~I>p3!c z>#XBMx5?y;&`*PS=IfhWXs*lfyDsk%g9lw7Z)3i4!M7@SZLO7D2Vp!?P5O)axdr*S zd6gTjEPkn!{fe(!WWG-))|0>M3(qwcukKU%2Ie*VXa4ekQL)<8Pa{VV4{R{bpan2BZ;ajY_k{jsZPGUzknRP`uL?{ zP4i4ELXt^VvRk`bz)HX@R|a5lYw@mann%Mvh=;(4clk%}FnqG?r8QVG9GKV}$K^7^ zQ1Y+J=5c>kKIxs^Cdp*K5t_Dxd{D$5z8?Gx)YIY2POai>GNq1;v3{99S2uASzybZ( zmP)K!@iL)etw!eMPBs zk(lEes_ys%YqPo6Zvg7#OdE!qS@A9Gj5}b~qD*5PeMp`&6=XSQB;-{;ijTu*>tVXU zB}TyBi*vRQZr`Zbu$+`qRI!IZ`1|ClqnW*?fNzGs8gP4@0c^ z7p_J2j44i#?>I=(_^76`nw!~^Z5U4tk0IBch3@dxTIwj6KdmM3@i{Dlw?D#3ZB^4^f|;o|2k`Xt zTaEY0lk3;KAhzXoFJgi(7@abRhFAbr%jTDFXX`=a_k*gc*E-r$ z92Z+Mv2)t7UoL7(Ih6W{(9uNZs0-EtG>i-e=dM~Q7P5zl6EoQzi1%vxS)!)y@<(b& 
zEyi~?Q4rx}01k%#?)enVDUD}^)fQ~oq7iwy3&U1K{Vg)cG5aVf+;4XAK7~YJ;R%jo zpQMuUo^<>_?VWiv6zm?yr-UTSOBhV1?1>R$-}9oG#*)f9lwFJ=vQ09!#MsIZjU{AX z!wkkdHDpV&PK=CHQoQxznvo@g?%Z?ky??#;+}k;w``7#LbI$Yp@jK6X&hvbp@9+DS zIvauFe!x7J`o1N_&RsufzBAQ)JoO2&^QaG0n3K5qVgcv|h>)^v32Cr2?9S<*I;&vce!k z^-cT?06hJFdXfLv|CHFng2jAAGDALU4PIMe6`*FnWpqs^tULDI1}U3})9%@YapM3A5;)LeO6HiS2#;WG*%3We{(wAN(4~v{~?lryt^gPwe z?f7#*^RrPi&NIWQxs5GYjP~Eets14`)eYviUMP<9ECnxz;$!~#qx4_Y$_;Vbh!g2s zy1zwidZ-NliZia`Z?(rVCS1=#X| zGNtrBU2!_?cY~ofLpg9&S>>71EIFAay)bzo;qWh*+bj*Tm~;t;(~a&=a5$akKR6xf zThdDwlY(6>3?04c)%lGiBy3XdTc`Ncdvja%p~QOlz_rNpo_ZFdry)l7u>QC5gOdPG zcf0X3cGY9m(baAp`=g{RI(@3rill;T#tm$eSSGB$2?+a8V z5*Ot_3DSn(fF%?F_Vr|L{Lm0-iD^la;8?{ya;KJXJy?y<;@gY{fzEg#F-M~P98^*w z7p1}ISpIXY{kKm1y+pdmiRGQkQlaa4^A^bF@iSPBX&U zv61|-2Uk}X$t`T2Z}Mqwg!%Q1BGn9<&U;EO^#I_2nFMc!ESu+X4K=&=T=i#M_*nQ` zxJxFb)!-41e@XfEn>xKOdW|+vM;pWlZl#*|B!vfsD5m;&65FVbzP>>qO}r-1Kdc#4 z@)L6a-V5Ng*}x5Q_#>JYi?bV@l<~`&hY1w@Oc&{!UbAGs<`lcXvS0q@d zsl4_@_2>0KUx*)aiNuW^F;or^%T0KCDT8I!WV`htxZL~w+f zc{WZi$HeEnltuL5Vdv&IFC^WPH#l00oQYj0?eBo}Rjq)LM`T2`cx|?EU9YB9m0M1> zCOO-7l7o`s4C}$quqITdF3yvfznsWeutqUOdRv_ZJsFt}(r3`xx@ip4}w zjvLxPXyw)f@K@vEjMNlQkCwdG1dTF7vr+xVBLEiJDE0%Y5ZGq1KY)O*9zq?u*Q%w= zh?|X)boI!%aGZRS3;n5__w*!ky9w3B<@$%?A`O{dnA$M)N=-O>)sni)HRx!VR)6oh z`CzYz3@7J;$>wyXQ%}4yGExurp=C}o9I9#oNJSV^#n#$-EUdKM+ZwGfIM=T8Y<3(B z{8F-(6Da)rqg(7hZ2+(`(j_&pS3GCNhui+E@Wm z^Jh)kT<`mvM;Ap6VNy-F>Htex?b+WnkRWg_0gezPw*G6 zl619cc*Dmf&~7@fI_+~8*-_%$LPW$6Ke`Vl3PGSz7-6`gFJ9}z%$XG#!N}N6p)2?D zbeLbDR~w>#D^Q-#-U1VZJMFWIB*E*OVqWnYUu~RLu1F-=7um99(W??80qr7H7OY83 z8awLbW$F95$&&q;8Rq`uh{}MS+JcF4A`_e)w)y&(rv)=wMCjqg`_6Uw9%8c(rZaod zH!T?4MLVUFSI{uq<5#@`o%qC^fO|Bub?D%1(&&vGj#hqQM~1rPN`K6o74hP0J-iZ8 z5A>y?M6%Y*Sx92N9QKnh&U}~hk|O2@R!m9Vg-5$AQ>Sia&-OKfG&M+t7x!qe($+c{ z@%*xPmR`@T@lo5gDyfYlW0POiz(`hC8>wFUJ3z_gDr1G_P8bL!gZ#OwOG^cXZ0S$V zalcL0>c9sw8=tamRC+%Z{t-cLKWoqr9y(G_teRVdbq`3nLE`T0`q1|UDTrdM>55q% zRE*r|0+DOial9L Date: Sun, 2 Jun 2024 19:07:29 +0200 Subject: [PATCH 16/33] [tmad] regenerate all processes 
- add volatile in aloha_functions.f to fix SIGFPE #855 in rotxxx --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 18 +++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 ++-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 14 ++--- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 ++-- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 20 ++++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 18 +++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 10 ++-- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 24 ++++---- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 12 ++-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 24 ++++---- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++--- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 22 +++---- .../Source/DHELAS/aloha_functions.f | 2 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 10 ++-- .../CODEGEN_mad_heft_gg_bb_log.txt | 16 ++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 10 ++-- .../CODEGEN_mad_pp_tt012j_log.txt | 60 ++++++++++--------- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_mad_smeft_gg_tttt_log.txt | 20 ++++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 14 ++--- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 21 +++---- .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 10 ++-- .../CODEGEN_mad_susy_gg_tt_log.txt | 14 +++-- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 10 ++-- 32 files changed, 211 insertions(+), 190 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 01558e97fa..98652fbd7d 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 
+62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005818367004394531  +DEBUG: model prefixing takes 0.005783796310424805  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -199,14 +199,14 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.213 s +ALOHA: aloha creates 3 routines in 0.220 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.280 s +ALOHA: aloha creates 7 routines in 0.270 s FFV1 FFV1 FFV2 @@ -228,8 +228,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -250,10 +252,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.985s -user 0m1.747s -sys 0m0.212s -Code generation completed in 2 seconds +real 0m2.429s +user 0m1.885s +sys 0m0.270s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index b0fc131d4f..0901cfe618 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005814552307128906  +DEBUG: model prefixing takes 0.005842447280883789  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -184,7 +184,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.285 s +ALOHA: aloha creates 4 routines in 0.287 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.689s -user 0m0.639s -sys 0m0.045s +real 0m0.772s +user 0m0.647s +sys 0m0.048s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 92993ac924..7b00f92838 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058269500732421875  +DEBUG: model prefixing takes 0.005838871002197266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -198,12 +198,12 @@ Wrote files for 10 helas calls in 0.108 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.154 s +ALOHA: aloha creates 2 routines in 0.157 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.140 s +ALOHA: aloha creates 4 routines in 0.142 s VVV1 FFV1 FFV1 @@ -241,9 +241,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.091s -user 0m1.719s -sys 0m0.267s +real 0m1.994s +user 0m1.722s +sys 0m0.273s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index a4dc00f87b..060ebb1d1d 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005820274353027344  +DEBUG: model prefixing takes 0.005850076675415039  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.635s -user 0m0.505s -sys 0m0.048s -Code generation completed in 0 seconds +real 0m0.562s +user 0m0.507s +sys 0m0.051s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index b3f5ed282a..63e5119ac4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791902542114258  +DEBUG: model prefixing takes 0.00590205192565918  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -188,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -205,7 +205,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -221,14 +221,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s -Wrote files for 46 helas calls in 0.259 s +Wrote files for 46 helas calls in 0.257 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.347 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.330 s +ALOHA: aloha creates 10 routines in 0.328 s VVV1 VVV1 FFV1 @@ -259,8 +259,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -285,9 +287,9 @@ Type "launch" to generate events from this process, or see Run "open 
index.html" to see more information about this process. quit -real 0m2.666s -user 0m2.189s -sys 0m0.225s +real 0m2.751s +user 0m2.448s +sys 0m0.300s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index f2938af2d2..917aa6b5ee 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005806684494018555  +DEBUG: model prefixing takes 0.0058138370513916016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,14 +194,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.157 s +Wrote files for 36 helas calls in 0.159 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.345 s +ALOHA: aloha creates 5 routines in 0.347 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.331 s +ALOHA: aloha creates 10 routines in 0.330 s VVV1 VVV1 FFV1 @@ -232,8 +232,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching 
file Source/makefile patching file SubProcesses/makefile @@ -254,9 +256,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.562s -user 0m2.060s -sys 0m0.221s +real 0m2.570s +user 0m2.293s +sys 0m0.277s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 2650bec87e..c27b731327 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005831718444824219  +DEBUG: model prefixing takes 0.00583338737487793  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.345 s +ALOHA: aloha creates 5 routines in 0.346 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.852s -user 0m0.759s -sys 0m0.056s +real 0m0.869s +user 0m0.766s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index d479b476a0..e2c6e055a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00582575798034668  +DEBUG: model prefixing takes 0.005956411361694336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.169 s +1 processes with 123 diagrams generated in 0.171 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.460 s -Wrote files for 222 helas calls in 0.741 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.456 s +Wrote files for 222 helas calls in 0.744 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.355 s +ALOHA: aloha creates 5 routines in 0.354 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.338 s +ALOHA: aloha creates 10 routines in 0.334 s VVV1 VVV1 FFV1 @@ -235,8 +235,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -257,10 +259,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.505s -user 0m3.236s -sys 0m0.224s -Code generation completed in 3 seconds +real 0m4.109s +user 0m3.737s +sys 0m0.281s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index 7394e761fe..d0ec1dbde9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 89a9e25c18..db0459398d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057909488677978516  +DEBUG: model prefixing takes 0.005809783935546875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.170 s +1 processes with 123 diagrams generated in 0.168 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.451 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.458 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.544s -user 0m1.463s -sys 0m0.054s +real 0m1.536s +user 0m1.466s +sys 0m0.061s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index da6acfa84a..b4219bcd39 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005862236022949219  +DEBUG: model prefixing takes 0.005804300308227539  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.007 s +1 processes with 1240 diagrams generated in 1.997 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 7.000 s -Wrote files for 2281 helas calls in 19.666 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 7.030 s +Wrote files for 2281 helas calls in 19.751 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.340 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -211,7 +211,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.330 s +ALOHA: aloha creates 10 routines in 0.332 s VVV1 VVV1 FFV1 @@ -237,8 +237,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -259,10 +261,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m31.035s -user 0m30.488s -sys 0m0.422s -Code generation completed in 32 seconds +real 0m34.977s +user 0m34.181s +sys 0m0.473s +Code generation completed in 35 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index b0d9872611..2b891973d3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005834102630615234  +DEBUG: model prefixing takes 0.005815029144287109  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.007 s +1 processes with 1240 diagrams generated in 2.016 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 7.027 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.983 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.374 s +ALOHA: aloha creates 5 routines in 0.372 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.929s -user 0m13.764s -sys 0m0.108s +real 0m14.131s +user 0m13.685s +sys 0m0.117s Code generation completed in 14 seconds diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index c685148505..fb1c56fdea 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005826234817504883  +DEBUG: model prefixing takes 0.005858421325683594  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -201,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -218,7 +218,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -234,16 +234,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s -Wrote files for 32 helas calls in 0.234 s +Wrote files for 32 helas calls in 0.235 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.154 s +ALOHA: aloha creates 2 routines in 0.156 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.140 s +ALOHA: aloha creates 4 routines in 0.141 s FFV1 FFV1 FFV1 @@ -262,8 +262,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -296,10 +298,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.041s -user 0m1.799s -sys 0m0.227s -Code generation completed in 2 seconds +real 0m2.457s +user 0m2.036s +sys 0m0.309s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 497eedfefd..eeca578667 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00579071044921875  +DEBUG: model prefixing takes 0.005826473236083984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.083 s +8 processes with 40 diagrams generated in 0.084 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.690s -user 0m0.614s -sys 0m0.065s +real 0m0.693s +user 0m0.629s +sys 0m0.059s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index 534d00c329..a37dd33635 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -150,7 +150,7 @@ INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Creating files in directory P1_gg_bbx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -166,20 +166,20 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.111 s +Wrote files for 12 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.280 s +ALOHA: aloha creates 4 routines in 0.279 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.264 s +ALOHA: aloha creates 8 routines in 0.263 s VVS3 VVV1 FFV1 @@ -199,8 +199,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -217,9 +219,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.047s -user 0m1.806s -sys 0m0.230s +real 0m2.443s +user 0m1.976s +sys 0m0.266s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 8eeb803fc7..9d498faa46 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -157,7 +157,7 @@ ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.277 s +ALOHA: aloha creates 4 routines in 0.283 s VVS3 VVV1 FFV1 @@ -174,7 +174,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m0.723s -user 0m0.625s -sys 0m0.046s -Code generation completed in 1 seconds +real 0m0.887s +user 0m0.624s +sys 0m0.057s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index b26d47fafc..c999c8e3d4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005836009979248047  +DEBUG: model prefixing takes 0.00587010383605957  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.146 s +13 processes with 76 diagrams generated in 0.145 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.970 s +65 processes with 1119 diagrams generated in 1.957 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -500,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -517,7 +517,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -534,7 +534,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -551,7 +551,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -568,7 +568,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -585,7 +585,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -602,7 +602,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -619,7 +619,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -636,7 +636,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -653,7 +653,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -670,7 +670,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -687,7 +687,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -704,7 +704,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -721,7 +721,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -738,7 +738,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -755,7 +755,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -772,7 +772,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -789,7 +789,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.375 s -Wrote files for 810 helas calls in 3.466 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.373 s +Wrote files for 810 helas calls in 3.476 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.357 s +ALOHA: aloha creates 5 routines in 0.365 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.333 s +ALOHA: aloha creates 10 routines in 1.909 s VVV1 VVV1 FFV1 @@ -846,8 +846,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -1030,10 +1032,10 @@ Type "launch" to generate events from this 
process, or see Run "open index.html" to see more information about this process. quit -real 0m9.404s -user 0m8.903s -sys 0m0.418s -Code generation completed in 10 seconds +real 0m13.190s +user 0m10.662s +sys 0m0.902s +Code generation completed in 13 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index e171469df6..249702623d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.14738821983337402  +DEBUG: model prefixing takes 0.1493072509765625  INFO: Change particles name to 
pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.935 s +1 processes with 72 diagrams generated in 3.966 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 INFO: Creating files in directory P1_gg_ttxttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  
[export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -131,14 +131,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx Generated helas calls for 1 subprocesses (72 diagrams) in 0.199 s -Wrote files for 119 helas calls in 0.442 s +Wrote files for 119 helas calls in 0.443 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.339 s +ALOHA: aloha creates 5 routines in 0.338 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -146,7 +146,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.355 s +ALOHA: aloha creates 10 routines in 0.353 s VVV5 VVV5 FFV1 @@ -169,8 +169,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file 
SubProcesses/makefile @@ -191,9 +193,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m7.305s -user 0m6.946s -sys 0m0.253s +real 0m7.672s +user 0m7.285s +sys 0m0.320s Code generation completed in 7 seconds ************************************************************ * * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index afe7467840..aba71fa3d5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.14697813987731934  +DEBUG: model prefixing takes 
0.14808392524719238  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.942 s +1 processes with 72 diagrams generated in 3.955 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.199 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.200 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -123,7 +123,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.334 s +ALOHA: aloha creates 5 routines in 0.335 s VVV5 VVV5 FFV1 @@ -143,7 +143,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.448s -user 0m5.330s -sys 0m0.058s +real 0m5.428s +user 0m5.332s +sys 0m0.075s Code generation completed in 5 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 381ba3a551..16c822599f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.128 s +1 processes with 6 diagrams generated in 0.132 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,19 +592,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s -Wrote files for 16 helas calls in 0.116 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s +Wrote files for 16 helas calls in 0.117 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.194 s +ALOHA: aloha creates 3 routines in 0.198 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 
routines in 0.189 s +ALOHA: aloha creates 6 routines in 0.194 s VVV1 VSS1 VSS1 @@ -626,6 +626,7 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -646,10 +647,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.122s -user 0m2.790s -sys 0m0.322s -Code generation completed in 3 seconds +real 0m3.624s +user 0m2.849s +sys 0m0.311s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 4953f08208..428621a368 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.131 s +1 processes with 6 diagrams generated in 0.130 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Load PLUGIN.CUDACPP_OUTPUT @@ -583,7 +583,7 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.196 s +ALOHA: aloha creates 3 routines in 0.194 s VVV1 VSS1 VSS1 @@ -599,7 +599,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.421s -user 0m1.348s -sys 0m0.062s +real 0m1.520s +user 0m1.334s +sys 0m0.076s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 2fb4d8a715..abc4abb141 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,7 +592,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s Wrote files for 10 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -620,8 +620,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -638,10 +640,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.805s -user 0m2.548s -sys 0m0.244s -Code generation completed in 3 seconds +real 0m3.607s +user 0m2.713s +sys 0m0.297s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index a6c54f90b2..9f7d8a4530 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.125 s +1 processes with 3 diagrams generated in 0.128 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m1.365s -user 0m1.280s -sys 0m0.067s -Code generation completed in 1 seconds +real 0m1.906s +user 0m1.287s +sys 0m0.069s +Code generation completed in 2 seconds From 8df147d805b82d03e3eb5d9f711bfb0397e19c76 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 08:24:17 +0200 Subject: [PATCH 17/33] [tmad] rerun 102 tput tests on itscrd90 - all ok STARTED AT Sun Jun 2 07:28:58 PM CEST 2024 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Sun Jun 2 09:05:36 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Sun Jun 2 09:23:42 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Sun Jun 2 09:31:49 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Sun Jun 2 09:34:34 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Sun Jun 2 09:37:17 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ENDED(6) AT Sun Jun 2 09:40:05 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -mix -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb -makeclean ENDED(7) AT Sun Jun 2 10:08:08 PM CEST 2024 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- 
.../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- 
.../log_ggttgg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd1.txt | 100 
+++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_heftggbb_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_heftggbb_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_heftggbb_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_heftggbb_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_heftggbb_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_smeftggtttt_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_smeftggtttt_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_smeftggtttt_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_susyggt1t1_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_susyggt1t1_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_susyggt1t1_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_susyggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_susyggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_susyggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_susyggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_susyggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- 102 files changed, 4722 insertions(+), 4722 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 8f9275e4c9..de0caca761 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: 
Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:32:11 +DATE: 2024-06-02_20:48:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.832141e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.963737e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.194654e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.447081e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.931434e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.173062e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.663591 sec +TOTAL : 0.676046 sec INFO: No Floating Point Exceptions have been reported - 2,552,888,166 cycles # 2.852 GHz - 3,988,214,096 instructions # 1.56 insn per cycle - 0.955666761 seconds time elapsed + 2,567,759,777 cycles # 2.819 GHz + 3,947,530,526 instructions # 1.54 insn per cycle + 0.969595478 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / 
`nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053499e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.235255e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.235255e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.052568e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236916e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.236916e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.389071 sec +TOTAL : 6.401690 sec INFO: No Floating Point Exceptions have been reported - 18,286,976,651 cycles # 2.860 GHz - 43,966,894,713 instructions # 2.40 insn per cycle - 6.394304827 seconds time elapsed + 18,320,184,384 cycles # 2.860 GHz + 43,970,344,438 instructions # 2.40 insn per cycle + 6.407522814 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.575090e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.062276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.062276e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.556597e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.031950e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.031950e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.417510 sec +TOTAL : 4.466648 sec INFO: No Floating Point Exceptions have been reported - 12,744,894,916 cycles # 2.883 GHz - 31,001,019,523 instructions # 2.43 insn per cycle - 4.422588286 seconds time elapsed + 12,746,464,526 cycles # 2.851 GHz + 30,998,051,748 instructions # 2.43 insn per cycle + 4.472203598 seconds 
time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.946862e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.717949e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.717949e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.919243e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.664717e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664717e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.650052 sec +TOTAL : 3.699016 sec INFO: No Floating Point Exceptions have been reported - 10,049,299,034 cycles # 2.750 GHz - 19,366,983,583 instructions # 1.93 insn per cycle - 3.655131055 seconds time elapsed + 10,057,139,705 cycles # 2.715 GHz + 19,364,699,903 instructions # 1.93 insn per cycle + 3.704443201 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.022084e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.853820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.853820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.993729e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.809982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.809982e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.526182 sec +TOTAL : 3.577215 sec INFO: No Floating Point Exceptions have been reported - 9,727,313,452 cycles # 2.755 GHz - 18,976,774,064 instructions # 1.95 insn per cycle - 3.531366474 seconds time elapsed + 9,735,076,070 cycles # 2.718 GHz + 18,976,322,211 instructions # 1.95 insn per cycle + 3.583082575 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.695166e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233065e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.233065e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.660221e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.180702e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.180702e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.130967 sec +TOTAL : 4.214341 sec INFO: No Floating Point Exceptions have been reported - 8,580,931,991 cycles # 2.075 GHz - 15,727,945,386 instructions # 1.83 insn per cycle - 4.136130895 seconds time elapsed + 8,602,295,276 cycles # 2.039 GHz + 15,727,245,583 instructions # 1.83 insn per cycle + 4.219911758 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 556a164c58..7ea10d000a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:59:51 +DATE: 2024-06-02_21:26:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.482485e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.592798e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.592798e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.576625e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738660e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738660e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.321030 sec +TOTAL : 2.288746 sec INFO: No Floating Point Exceptions have been reported - 7,329,722,152 cycles # 2.840 GHz - 13,178,162,400 instructions # 1.80 insn per cycle - 2.637544426 seconds time elapsed + 7,214,350,790 cycles # 2.846 GHz + 12,908,181,952 instructions # 1.79 insn per cycle + 2.590853100 seconds time 
elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,15 +90,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.015910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.185158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.185158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.021503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.191283e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.191283e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.819356 sec +TOTAL : 6.785821 sec INFO: No Floating Point Exceptions have been reported - 19,469,024,508 cycles # 2.853 GHz - 44,194,459,972 instructions # 2.27 insn per cycle - 6.826135735 seconds time elapsed + 19,486,737,909 cycles # 2.870 GHz + 44,194,389,028 instructions # 2.27 insn per cycle + 6.792710044 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,15 +119,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.484528e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.911785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.911785e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.480951e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.909694e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909694e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.870031 sec +TOTAL : 4.890119 sec INFO: No Floating Point Exceptions have been reported - 13,928,289,994 cycles # 2.857 GHz - 31,840,505,402 instructions # 2.29 insn per cycle - 4.876819018 seconds time elapsed + 14,036,550,987 cycles # 2.867 GHz + 31,841,545,843 instructions # 2.27 insn per cycle + 4.897057684 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.770728e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.386706e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.386706e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.794479e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.441373e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.441373e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.194561 sec +TOTAL : 4.161489 sec INFO: No Floating Point Exceptions have been reported - 11,361,909,372 cycles # 2.705 GHz - 20,728,193,515 instructions # 1.82 insn per cycle - 4.201564491 seconds time elapsed + 11,384,673,501 cycles # 2.732 GHz + 20,728,132,603 instructions # 1.82 insn per cycle + 4.168544668 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -177,15 +177,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.837895e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.511052e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.511052e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.875462e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.582851e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.582851e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.060412 sec +TOTAL : 4.002181 sec INFO: No Floating Point Exceptions have been reported - 10,967,372,142 cycles # 2.697 GHz - 20,348,024,135 instructions # 1.86 insn per cycle - 4.067336299 seconds time elapsed + 10,994,800,793 cycles # 2.743 GHz + 20,338,605,981 instructions # 1.85 insn per cycle + 4.009209282 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -206,15 +206,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.574771e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.036722e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.036722e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.543831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.990093e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.990093e+06 ) sec^-1 MeanMatrixElemValue = ( 
1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.642745 sec +TOTAL : 4.737577 sec INFO: No Floating Point Exceptions have been reported - 9,876,735,144 cycles # 2.125 GHz - 16,873,564,045 instructions # 1.71 insn per cycle - 4.649693422 seconds time elapsed + 9,979,903,374 cycles # 2.116 GHz + 16,882,096,595 instructions # 1.69 insn per cycle + 4.744772163 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 752636bf13..5164f42c9d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:10:56 +DATE: 2024-06-02_21:37:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.514734e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.592124e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.118434e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.007128e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.848481e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.122082e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.385277 sec +TOTAL : 1.379492 sec INFO: No Floating Point Exceptions have been reported - 4,585,820,337 cycles # 2.836 GHz - 7,177,605,134 instructions # 1.57 insn per cycle - 1.675534023 seconds time elapsed + 4,577,420,690 cycles # 2.842 GHz + 7,053,134,533 instructions # 1.54 insn per cycle + 1.667006809 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053610e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.236408e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.236408e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.053204e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237226e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237226e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.764205 sec +TOTAL : 6.775998 sec INFO: No Floating Point Exceptions have been reported - 19,353,708,257 cycles # 2.859 GHz - 44,070,957,602 instructions # 2.28 insn per cycle - 6.769682162 seconds time elapsed + 19,412,850,180 cycles # 2.863 GHz + 44,070,335,531 instructions # 2.27 insn per cycle + 6.781477582 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.538542e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015186e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.015186e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.545449e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.023167e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.023167e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.882143 sec +TOTAL : 4.877650 sec INFO: No Floating Point Exceptions have been reported - 13,858,733,837 cycles # 2.836 GHz - 31,001,638,282 instructions # 2.24 insn per cycle - 4.887574523 seconds time elapsed + 13,891,433,885 cycles # 2.845 GHz + 31,001,668,128 instructions # 2.23 insn per cycle + 4.883211371 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.916460e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.668694e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.668694e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893576e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.631253e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.631253e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.073472 sec +TOTAL : 4.124694 sec INFO: No Floating Point Exceptions have been reported - 11,156,313,503 cycles # 2.736 GHz - 19,267,334,271 instructions # 1.73 insn per cycle - 4.078862770 seconds time elapsed + 11,185,517,102 cycles # 2.709 GHz + 19,267,834,957 instructions # 1.72 insn per cycle + 4.130263002 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.001009e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.815270e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.815270e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.993280e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.807930e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.807930e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.939832 sec +TOTAL : 3.962775 sec INFO: No Floating Point Exceptions have been reported - 10,815,664,939 cycles # 2.742 GHz - 18,691,798,772 instructions # 1.73 insn per cycle - 3.945208768 seconds time elapsed + 10,857,852,651 cycles # 2.737 GHz + 18,688,313,206 instructions # 1.72 insn per cycle + 3.968277953 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.659550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.188571e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.188571e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.669707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.199425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.199425e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.600299 sec +TOTAL : 4.577580 sec INFO: No Floating Point Exceptions have been reported - 9,711,022,403 cycles # 2.112 GHz - 15,432,876,214 instructions # 1.59 insn per cycle - 4.605929662 seconds time elapsed + 9,715,691,578 cycles # 2.121 GHz + 15,431,480,999 instructions # 1.59 insn per cycle + 4.583151536 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 55f8e65d60..cd3c8cd8c3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:08:14 +DATE: 2024-06-02_21:34:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.528786e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.598045e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.123971e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.026327e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.798452e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.142109e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.009724 sec +TOTAL : 0.998322 sec INFO: No Floating Point Exceptions have been reported - 3,524,520,771 cycles # 2.832 GHz - 6,986,852,382 instructions # 1.98 insn per cycle - 1.303213412 seconds time elapsed + 3,480,084,828 cycles # 2.833 GHz + 7,034,366,070 instructions # 2.02 insn per cycle + 1.285315972 seconds time 
elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.055666e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.238246e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.238246e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051590e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.235793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.235793e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.381413 sec +TOTAL : 6.406221 sec INFO: No Floating Point Exceptions have been reported - 18,267,573,257 cycles # 2.861 GHz - 43,966,026,516 instructions # 2.41 insn per cycle - 6.386884750 seconds time elapsed + 18,351,317,436 cycles # 2.863 GHz + 43,970,738,136 instructions # 2.40 insn per cycle + 6.411801951 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.556382e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.037173e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.037173e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = 
( 1.546913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.022660e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.022660e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.468246 sec +TOTAL : 4.494676 sec INFO: No Floating Point Exceptions have been reported - 12,780,247,489 cycles # 2.858 GHz - 30,998,946,765 instructions # 2.43 insn per cycle - 4.473714210 seconds time elapsed + 12,818,589,929 cycles # 2.849 GHz + 31,001,594,318 instructions # 2.42 insn per cycle + 4.500171814 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.920501e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.673029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.673029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.924966e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.685276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.685276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.698478 sec +TOTAL : 3.690851 sec INFO: No Floating Point Exceptions have been reported - 10,077,708,073 cycles # 2.723 GHz - 19,366,955,499 instructions # 1.92 insn per cycle - 3.704491612 seconds time elapsed + 10,087,697,509 cycles # 2.730 GHz + 19,365,345,065 instructions # 1.92 insn per cycle + 3.696395456 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.006263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.826924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.826924e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.949695e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.742629e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.742629e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.554344 sec +TOTAL : 3.652825 sec INFO: No Floating Point Exceptions have been reported - 9,708,097,650 cycles # 2.728 GHz - 18,987,540,468 instructions # 1.96 insn per cycle - 3.559725957 seconds time elapsed + 9,810,916,380 cycles # 2.684 GHz + 18,988,601,654 instructions # 1.94 insn per cycle + 3.658397686 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.673906e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201263e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201263e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.670977e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.205176e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.205176e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 
+- 3.270315e-06 ) GeV^0 -TOTAL : 4.180852 sec +TOTAL : 4.189326 sec INFO: No Floating Point Exceptions have been reported - 8,572,399,884 cycles # 2.048 GHz - 15,727,509,673 instructions # 1.83 insn per cycle - 4.186463799 seconds time elapsed + 8,618,980,631 cycles # 2.055 GHz + 15,727,806,217 instructions # 1.82 insn per cycle + 4.194947819 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 8320028620..3d612f0f8f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:05:29 +DATE: 2024-06-02_21:31:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.845001e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.545626e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.012755e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.966609e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.692288e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.029334e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.945484 sec +TOTAL : 1.920797 sec INFO: No Floating Point Exceptions have been reported - 6,182,952,030 cycles # 2.840 GHz - 11,472,065,600 instructions # 1.86 insn per cycle - 2.234065267 seconds time elapsed + 6,106,858,009 cycles # 2.839 GHz + 11,364,477,574 instructions # 1.86 insn per cycle + 2.208478557 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -83,15 +83,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.234940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.234940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.046138e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.230727e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.230727e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.394281 sec +TOTAL : 6.439007 sec INFO: No Floating Point Exceptions have been reported - 18,294,474,856 cycles # 2.859 GHz - 43,971,000,114 instructions # 2.40 insn per cycle - 6.399562206 seconds time elapsed + 18,389,458,277 cycles # 2.854 GHz + 43,970,512,226 instructions # 2.39 insn per cycle + 6.444728897 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.557791e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040989e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536098e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.017790e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.017790e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) 
GeV^0 -TOTAL : 4.463787 sec +TOTAL : 4.525302 sec INFO: No Floating Point Exceptions have been reported - 12,772,778,524 cycles # 2.859 GHz - 30,998,712,334 instructions # 2.43 insn per cycle - 4.469196075 seconds time elapsed + 12,893,450,701 cycles # 2.847 GHz + 31,000,830,473 instructions # 2.40 insn per cycle + 4.530866155 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -139,15 +139,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925684e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.680642e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.680642e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.928619e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689762e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.688587 sec +TOTAL : 3.685051 sec INFO: No Floating Point Exceptions have been reported - 10,072,112,495 cycles # 2.727 GHz - 19,365,616,714 instructions # 1.92 insn per cycle - 3.694022814 seconds time elapsed + 10,071,705,419 cycles # 2.730 GHz + 19,365,099,946 instructions # 1.92 insn per cycle + 3.690595449 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.999182e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.825411e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.825411e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.001165e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.830714e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.830714e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.565445 sec +TOTAL : 3.567461 sec INFO: No Floating Point Exceptions have been reported - 9,737,532,909 cycles # 2.728 GHz - 18,976,607,709 instructions # 1.95 insn per cycle - 3.570830090 seconds time elapsed + 9,752,859,688 cycles # 2.730 GHz + 18,976,384,316 instructions # 1.95 insn per cycle + 3.573110488 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -195,15 +195,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.671566e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.197159e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.197159e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.669903e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201435e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201435e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.185653 sec +TOTAL : 4.193613 sec INFO: No Floating Point Exceptions have been reported - 8,570,262,444 cycles # 2.045 GHz - 15,727,819,138 instructions # 1.84 insn per cycle - 4.191109662 seconds 
time elapsed + 8,620,136,128 cycles # 2.053 GHz + 15,727,513,221 instructions # 1.82 insn per cycle + 4.199142686 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 6aa3de3ecf..3617e224c6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:32:42 +DATE: 2024-06-02_20:49:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.832072e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.951586e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.177922e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.362484e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.520610e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.206124e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.666506 sec +TOTAL : 0.676929 sec INFO: No Floating Point 
Exceptions have been reported - 2,570,661,306 cycles # 2.841 GHz - 3,994,547,928 instructions # 1.55 insn per cycle - 0.967516454 seconds time elapsed + 2,569,510,705 cycles # 2.820 GHz + 4,003,501,539 instructions # 1.56 insn per cycle + 0.973315338 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115974e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.322257e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322257e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.103860e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.307114e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.307114e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.054676 sec +TOTAL : 6.122439 sec INFO: No Floating Point Exceptions have been reported - 17,515,565,744 cycles # 2.891 GHz - 41,813,477,100 instructions # 2.39 insn per cycle - 6.059803806 seconds time elapsed + 17,532,692,730 cycles # 2.862 GHz + 41,814,035,675 instructions # 2.38 insn per cycle + 6.128383150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.620797e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.138518e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.138518e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.581686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.085119e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.085119e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.301844 sec +TOTAL : 4.404931 sec INFO: No Floating Point Exceptions have been reported - 12,450,766,554 cycles # 2.891 GHz - 30,161,114,565 instructions # 2.42 insn per cycle - 4.307292943 seconds time elapsed + 12,515,101,521 cycles # 2.838 GHz + 30,161,142,578 instructions # 2.41 insn per cycle + 4.410559397 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.953305e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.731201e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.731201e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.946029e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.718574e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.718574e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.641578 sec +TOTAL : 3.653893 sec INFO: No Floating Point Exceptions have been reported - 9,958,194,708 cycles # 2.732 GHz - 19,097,340,022 instructions # 1.92 insn per cycle - 3.646748223 seconds time elapsed + 9,961,431,996 cycles # 2.723 GHz + 19,096,639,277 instructions # 
1.92 insn per cycle + 3.659578231 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.030144e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.871070e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.871070e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.016173e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.848698e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.848698e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.516336 sec +TOTAL : 3.542404 sec INFO: No Floating Point Exceptions have been reported - 9,615,329,857 cycles # 2.731 GHz - 18,757,197,169 instructions # 1.95 insn per cycle - 3.521486960 seconds time elapsed + 9,660,599,362 cycles # 2.725 GHz + 18,744,004,297 instructions # 1.94 insn per cycle + 3.547571851 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.716452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.276447e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.276447e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.727713e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.296045e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.296045e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.085042 sec +TOTAL : 4.062176 sec INFO: No Floating Point Exceptions have been reported - 8,419,965,935 cycles # 2.059 GHz - 15,604,092,420 instructions # 1.85 insn per cycle - 4.090208290 seconds time elapsed + 8,450,337,585 cycles # 2.078 GHz + 15,603,422,783 instructions # 1.85 insn per cycle + 4.067782201 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index b8b45776b1..501b51f71f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:50:17 +DATE: 2024-06-02_21:16:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.702175e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.710921e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.156854e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.738504e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.732597e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.167044e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.694390 sec +TOTAL : 0.684793 sec INFO: No Floating Point Exceptions have been reported - 2,605,646,002 cycles # 2.805 GHz - 4,054,878,448 instructions # 1.56 insn per cycle - 0.990293563 seconds time elapsed + 2,601,107,988 cycles # 2.822 GHz + 4,061,282,635 instructions # 1.56 insn per cycle + 0.978559950 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.570616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
2.013848e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.013848e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.575213e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.020448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020448e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.427060 sec +TOTAL : 4.416526 sec INFO: No Floating Point Exceptions have been reported - 12,653,335,495 cycles # 2.855 GHz - 32,508,582,789 instructions # 2.57 insn per cycle - 4.432506316 seconds time elapsed + 12,654,142,005 cycles # 2.862 GHz + 32,510,363,434 instructions # 2.57 insn per cycle + 4.422251656 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.997942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.867798e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.867798e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002255e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.874219e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.874219e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.572733 sec +TOTAL : 3.568265 sec INFO: No Floating Point Exceptions have been reported - 10,224,022,422 cycles # 2.858 GHz - 24,474,305,392 instructions # 2.39 insn per cycle - 3.578147466 seconds time elapsed + 10,224,593,553 cycles # 2.863 GHz + 24,472,095,992 instructions # 2.39 insn per cycle + 3.573538181 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.166030e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.174429e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.174429e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.169162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.169863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.169863e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.325864 sec +TOTAL : 3.324119 sec INFO: No Floating Point Exceptions have been reported - 9,098,194,590 cycles # 2.732 GHz - 16,922,780,551 instructions # 1.86 insn per cycle - 3.331363940 seconds time elapsed + 9,111,176,688 cycles # 2.737 GHz + 16,922,082,397 instructions # 1.86 insn per cycle + 3.329746327 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.212975e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.265291e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.265291e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.220354e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.270284e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.270284e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.263243 sec +TOTAL : 3.254253 sec INFO: No Floating Point Exceptions have been reported - 8,899,581,855 cycles # 2.723 GHz - 16,332,700,862 instructions # 1.84 insn per cycle - 3.268811314 seconds time elapsed + 8,910,060,786 cycles # 2.734 GHz + 16,345,046,075 instructions # 1.83 insn per cycle + 3.260025356 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.881513e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.573087e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.573087e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.878740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571241e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571241e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.766130 sec +TOTAL : 3.773398 sec INFO: No Floating Point Exceptions have been reported - 7,870,004,063 cycles # 2.087 GHz - 14,582,523,760 instructions # 1.85 insn per cycle - 3.771527980 seconds time elapsed + 7,901,326,876 cycles # 2.092 GHz + 14,582,511,484 instructions # 1.85 insn per cycle + 3.778605571 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 36ca3a055a..fa73177cd7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:50:44 +DATE: 2024-06-02_21:17:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.703728e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.728666e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.213805e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.732503e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.754634e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.215471e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.683308 sec +TOTAL : 0.688215 sec INFO: No Floating Point Exceptions have been reported - 2,600,218,993 cycles # 2.827 GHz - 4,020,842,023 instructions # 1.55 insn per cycle - 0.979103636 seconds time elapsed + 2,639,635,060 cycles # 2.820 GHz + 4,015,174,984 instructions # 1.52 insn per cycle + 0.994462896 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.089739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.950650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.083631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.949296e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.949296e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.429960 sec +TOTAL : 3.441397 sec INFO: No Floating Point Exceptions have been reported - 9,811,818,087 cycles # 2.857 GHz - 25,388,363,151 instructions # 2.59 insn per cycle - 3.435408237 seconds time elapsed + 9,846,812,518 cycles # 2.858 GHz + 25,386,191,431 instructions # 2.58 insn per cycle + 3.446476615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.308707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576242e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576242e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.316581e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.557276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.150608 sec +TOTAL : 3.142273 sec INFO: No Floating Point Exceptions have been reported - 9,017,040,298 cycles # 2.858 GHz - 21,483,572,468 instructions # 2.38 insn per cycle - 3.156151233 seconds time elapsed + 8,991,441,142 cycles # 2.857 GHz + 21,484,440,131 instructions # 2.39 insn per cycle + 3.147929478 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.317514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502934e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.502934e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.320671e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.503530e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.503530e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.137476 sec +TOTAL : 3.133297 sec INFO: No Floating Point Exceptions have been reported - 8,591,075,632 cycles # 2.734 GHz - 15,811,134,800 instructions # 1.84 insn per cycle - 3.143116597 seconds time elapsed + 8,580,721,113 cycles # 2.735 GHz + 15,811,719,082 instructions # 1.84 insn per cycle + 3.138961399 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.364262e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604788e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604788e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.375766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.617874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617874e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.081738 sec +TOTAL : 3.069315 sec INFO: No Floating Point Exceptions have been reported - 8,442,051,612 cycles # 2.735 GHz - 15,504,513,991 instructions # 1.84 insn per cycle - 3.087247738 seconds time elapsed + 8,463,481,626 cycles # 2.754 GHz + 15,513,175,556 instructions # 1.83 insn per cycle + 3.074609069 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.995819e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.803385e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.803385e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.011258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.824275e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.824275e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 
+- 3.270315e-06 ) GeV^0 -TOTAL : 3.575599 sec +TOTAL : 3.549017 sec INFO: No Floating Point Exceptions have been reported - 7,560,717,738 cycles # 2.112 GHz - 14,283,918,013 instructions # 1.89 insn per cycle - 3.581217674 seconds time elapsed + 7,565,498,334 cycles # 2.129 GHz + 14,283,366,137 instructions # 1.89 insn per cycle + 3.554588261 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index e6a48e18ea..78b8b832b6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:33:12 +DATE: 2024-06-02_20:49:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.602023e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319974e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.288580e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.453404e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.301856e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.286518e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.573413 sec +TOTAL : 0.580838 sec INFO: No Floating Point Exceptions have been reported - 2,248,961,780 cycles # 2.823 GHz - 3,510,545,687 instructions # 1.56 insn per cycle - 0.854969378 seconds time elapsed + 2,286,209,161 cycles # 2.821 GHz + 3,532,764,689 instructions # 1.55 insn per cycle + 0.869255178 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.082006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.283429e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283429e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078555e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.280307e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.280307e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.191643 sec +TOTAL : 6.210359 sec INFO: No Floating Point Exceptions have been reported - 17,740,084,916 cycles # 2.864 GHz - 43,510,870,904 instructions # 2.45 insn per cycle - 6.196630799 seconds time elapsed + 17,783,383,753 cycles # 2.862 GHz + 43,511,171,857 instructions # 2.45 insn per cycle + 6.215902116 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.223628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.400190e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.400190e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.210434e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.377494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377494e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.211334 sec +TOTAL : 3.231501 sec INFO: No Floating Point Exceptions have been reported - 9,244,093,548 cycles # 2.875 GHz - 21,907,620,538 instructions # 2.37 insn per cycle - 3.216400901 seconds time elapsed + 9,255,830,965 cycles # 2.863 GHz + 21,906,871,719 instructions # 2.37 insn per cycle + 3.236386372 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.419643e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.716778e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.716778e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.371336e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.610412e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.610412e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.967957 sec +TOTAL : 3.030129 sec INFO: No Floating Point Exceptions have been reported - 8,316,472,651 cycles # 2.798 GHz - 15,592,546,873 instructions # 1.87 insn per cycle - 2.972957823 seconds time elapsed + 8,294,048,623 cycles # 2.733 GHz + 15,590,527,403 instructions # 1.88 insn per cycle + 3.035436377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.415944e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.716005e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.716005e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397036e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.670362e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.670362e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.973842 sec +TOTAL : 3.001269 sec INFO: No Floating Point Exceptions have been reported - 8,315,082,395 cycles # 2.793 GHz - 15,436,266,122 instructions # 1.86 insn per cycle - 2.978955673 seconds time elapsed + 8,226,933,402 cycles # 2.737 GHz + 15,430,117,600 instructions # 1.88 insn per cycle + 3.006462525 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.416813e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677411e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677411e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353627e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576193e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576193e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.974949 sec +TOTAL : 3.054034 sec INFO: No Floating Point Exceptions have been reported - 6,610,937,423 cycles # 2.219 GHz - 12,863,752,208 instructions # 1.95 insn per cycle - 2.980091060 seconds time elapsed + 6,654,880,184 cycles # 2.176 GHz + 12,863,187,093 instructions # 1.93 insn per cycle + 3.059348788 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 29c604a610..65b53d740f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:00:26 +DATE: 2024-06-02_21:26:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.941162e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.420865e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.420865e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.060712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.025766e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.025766e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.745821 sec +TOTAL : 1.728605 sec INFO: No Floating Point Exceptions have been reported - 5,608,708,868 cycles # 2.838 GHz - 10,190,752,473 instructions # 1.82 insn per cycle - 2.033892232 seconds time elapsed + 5,559,492,246 cycles # 2.840 GHz + 10,134,973,470 instructions # 1.82 insn per cycle + 2.015457056 seconds time 
elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,15 +90,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.055357e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251163e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251163e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.061003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.254218e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254218e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.442554 sec +TOTAL : 6.420879 sec INFO: No Floating Point Exceptions have been reported - 18,431,262,224 cycles # 2.859 GHz - 43,659,496,470 instructions # 2.37 insn per cycle - 6.448903506 seconds time elapsed + 18,399,874,886 cycles # 2.863 GHz + 43,656,453,581 instructions # 2.37 insn per cycle + 6.427487997 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,15 +119,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.110619e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.160777e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.160777e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
2.114817e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.166060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.166060e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.492633 sec +TOTAL : 3.495712 sec INFO: No Floating Point Exceptions have been reported - 9,993,500,583 cycles # 2.857 GHz - 23,243,476,984 instructions # 2.33 insn per cycle - 3.498991107 seconds time elapsed + 10,015,137,457 cycles # 2.860 GHz + 23,241,753,742 instructions # 2.32 insn per cycle + 3.502264923 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.275264e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.392372e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.392372e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231996e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.358012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358012e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.272099 sec +TOTAL : 3.334222 sec INFO: No Floating Point Exceptions have been reported - 9,005,707,266 cycles # 2.748 GHz - 16,711,349,389 instructions # 1.86 insn per cycle - 3.278668519 seconds time elapsed + 9,138,089,758 cycles # 2.736 GHz + 16,713,258,734 instructions # 1.83 insn per cycle + 3.340848954 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -177,15 +177,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.297831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.443567e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443567e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.270258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.427950e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.427950e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.244587 sec +TOTAL : 3.281372 sec INFO: No Floating Point Exceptions have been reported - 8,928,752,660 cycles # 2.747 GHz - 16,549,135,089 instructions # 1.85 insn per cycle - 3.250993607 seconds time elapsed + 9,005,942,306 cycles # 2.740 GHz + 16,548,921,552 instructions # 1.84 insn per cycle + 3.287811990 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -206,15 +206,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.241559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.306249e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.306249e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.240131e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.309974e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.309974e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 
3.270342e-06 ) GeV^0 -TOTAL : 3.324488 sec +TOTAL : 3.325465 sec INFO: No Floating Point Exceptions have been reported - 7,378,511,382 cycles # 2.216 GHz - 14,071,008,703 instructions # 1.91 insn per cycle - 3.330848983 seconds time elapsed + 7,397,809,520 cycles # 2.221 GHz + 14,072,596,030 instructions # 1.90 insn per cycle + 3.332157901 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 8016aaf3c8..41fcdf2cfe 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:11:30 +DATE: 2024-06-02_21:37:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.314779e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.179276e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254245e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.389352e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.220547e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.248789e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.213916 sec +TOTAL : 1.212858 sec INFO: No Floating Point Exceptions have been reported - 4,089,914,869 cycles # 2.847 GHz - 6,594,462,327 instructions # 1.61 insn per cycle - 1.494122889 seconds time elapsed + 4,093,911,360 cycles # 2.853 GHz + 6,566,528,457 instructions # 1.60 insn per cycle + 1.491492030 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.089350e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.292539e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.292539e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.083434e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.285638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.285638e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.490729 sec +TOTAL : 6.532719 sec INFO: No Floating Point Exceptions have been reported - 18,740,706,935 cycles # 2.886 GHz - 43,689,321,367 instructions # 2.33 insn per cycle - 6.495941000 seconds time elapsed + 18,783,324,742 cycles # 2.874 GHz + 43,693,376,231 instructions # 2.33 insn per cycle + 6.537953932 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.213519e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.407340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.407340e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.210512e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.382085e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382085e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.565494 sec +TOTAL : 3.582695 sec INFO: No Floating Point Exceptions have been reported - 10,288,737,724 cycles # 2.883 GHz - 21,988,558,280 instructions # 2.14 insn per cycle - 3.570732391 seconds time elapsed + 10,261,732,783 cycles # 2.861 GHz + 21,990,872,924 instructions # 2.14 insn per cycle + 3.587933515 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.392103e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.656361e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.656361e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.363828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.635295e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.635295e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.347045 sec +TOTAL : 3.389987 sec INFO: No Floating Point Exceptions have been reported - 9,294,224,919 cycles # 2.774 GHz - 15,502,535,760 instructions # 1.67 insn per cycle - 3.352354405 seconds time elapsed + 9,345,873,446 cycles # 2.754 GHz + 15,502,334,673 instructions # 1.66 insn per cycle + 3.395365367 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.411853e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.706271e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.706271e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.383815e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.695694e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.695694e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.334245 sec +TOTAL : 3.376377 sec INFO: No Floating Point Exceptions have been reported - 9,229,877,586 cycles # 2.765 GHz - 15,144,508,612 instructions # 1.64 insn per cycle - 3.339505215 seconds time elapsed + 9,309,965,265 cycles # 2.754 GHz + 15,139,174,417 instructions # 1.63 insn per cycle + 3.381718320 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.389642e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623022e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623022e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.370232e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594168e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594168e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.358884 sec +TOTAL : 3.394429 sec INFO: No Floating Point Exceptions have been reported - 7,623,474,420 cycles # 2.266 GHz - 12,573,351,599 instructions # 1.65 insn per cycle - 3.364654068 seconds time elapsed + 7,646,568,006 cycles # 2.250 GHz + 12,573,843,987 instructions # 1.64 insn per cycle + 3.399861496 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 3bd2ee01ac..cbd445fde8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:08:45 +DATE: 2024-06-02_21:35:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.323461e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.185145e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.269757e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.400821e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237112e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.282800e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.877821 sec +TOTAL : 0.873575 sec INFO: No Floating Point Exceptions have been reported - 3,113,911,295 cycles # 2.829 GHz - 6,352,740,713 instructions # 2.04 insn per cycle - 1.157340966 seconds time elapsed + 3,103,401,533 cycles # 2.833 GHz + 6,356,049,584 instructions # 2.05 insn per cycle + 1.152258826 seconds time elapsed 
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.080933e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281840e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281840e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.080565e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.282306e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.282306e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.198018 sec +TOTAL : 6.200319 sec INFO: No Floating Point Exceptions have been reported - 17,744,200,571 cycles # 2.861 GHz - 43,507,633,337 instructions # 2.45 insn per cycle - 6.203254296 seconds time elapsed + 17,766,785,300 cycles # 2.864 GHz + 43,507,689,985 instructions # 2.45 insn per cycle + 6.205548449 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.206413e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.379725e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.379725e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
2.219205e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.391133e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391133e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.236598 sec +TOTAL : 3.222734 sec INFO: No Floating Point Exceptions have been reported - 9,264,626,353 cycles # 2.859 GHz - 21,909,129,569 instructions # 2.36 insn per cycle - 3.241826343 seconds time elapsed + 9,245,511,720 cycles # 2.865 GHz + 21,907,133,008 instructions # 2.37 insn per cycle + 3.228067776 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.378945e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622195e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622195e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.362472e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.627969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.627969e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.023073 sec +TOTAL : 3.040708 sec INFO: No Floating Point Exceptions have been reported - 8,285,944,372 cycles # 2.737 GHz - 15,591,046,995 instructions # 1.88 insn per cycle - 3.028351636 seconds time elapsed + 8,344,967,895 cycles # 2.740 GHz + 15,591,025,923 instructions # 1.87 insn per cycle + 3.046295562 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.406944e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.689990e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.689990e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.379116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.690003e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.690003e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.991499 sec +TOTAL : 3.024127 sec INFO: No Floating Point Exceptions have been reported - 8,215,974,247 cycles # 2.742 GHz - 15,434,394,808 instructions # 1.88 insn per cycle - 2.996736921 seconds time elapsed + 8,310,000,031 cycles # 2.744 GHz + 15,436,072,136 instructions # 1.86 insn per cycle + 3.029766146 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.365448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582442e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582442e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361982e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.586100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.586100e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 
3.270342e-06 ) GeV^0 -TOTAL : 3.039216 sec +TOTAL : 3.044604 sec INFO: No Floating Point Exceptions have been reported - 6,609,003,865 cycles # 2.172 GHz - 12,863,939,056 instructions # 1.95 insn per cycle - 3.044504803 seconds time elapsed + 6,641,930,192 cycles # 2.179 GHz + 12,864,124,768 instructions # 1.94 insn per cycle + 3.049948851 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index ef6806658f..275da8993d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_15:06:01 +DATE: 2024-06-02_21:32:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.717298e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.145539e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.143407e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.808283e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.206237e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.142753e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.533112 sec +TOTAL : 1.519463 sec INFO: No Floating Point Exceptions have been reported - 4,980,418,158 cycles # 2.839 GHz - 9,119,342,139 instructions # 1.83 insn per cycle - 1.812784805 seconds time elapsed + 4,951,290,576 cycles # 2.843 GHz + 9,146,689,840 instructions # 1.85 insn per cycle + 1.798020957 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -83,15 +83,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069115e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.265540e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265540e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.076317e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.281294e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281294e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.262605 sec +TOTAL : 6.297116 sec INFO: No Floating Point Exceptions have been reported - 17,921,464,120 cycles # 2.860 GHz - 43,508,155,770 instructions # 2.43 insn per cycle - 6.267871711 seconds time elapsed + 18,034,882,237 cycles # 2.862 GHz + 43,508,302,495 instructions # 2.41 insn per cycle + 6.302484278 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.205993e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.384022e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.384022e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.210720e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.378151e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.378151e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) 
GeV^0 -TOTAL : 3.237394 sec +TOTAL : 3.233511 sec INFO: No Floating Point Exceptions have been reported - 9,270,351,681 cycles # 2.860 GHz - 21,907,147,046 instructions # 2.36 insn per cycle - 3.242634203 seconds time elapsed + 9,271,850,153 cycles # 2.864 GHz + 21,907,043,465 instructions # 2.36 insn per cycle + 3.238909660 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -139,15 +139,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379256e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.625966e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.625966e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.352851e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.608306e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.608306e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.019248 sec +TOTAL : 3.052167 sec INFO: No Floating Point Exceptions have been reported - 8,297,873,717 cycles # 2.744 GHz - 15,590,905,283 instructions # 1.88 insn per cycle - 3.024598202 seconds time elapsed + 8,355,651,202 cycles # 2.734 GHz + 15,591,192,622 instructions # 1.87 insn per cycle + 3.057529422 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.398987e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671948e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.671948e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.379040e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.675709e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.675709e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.000824 sec +TOTAL : 3.022810 sec INFO: No Floating Point Exceptions have been reported - 8,223,361,227 cycles # 2.736 GHz - 15,434,590,756 instructions # 1.88 insn per cycle - 3.006023707 seconds time elapsed + 8,287,014,121 cycles # 2.737 GHz + 15,428,840,508 instructions # 1.86 insn per cycle + 3.028205243 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -195,15 +195,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.360241e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.364932e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.586134e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.586134e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.046199 sec +TOTAL : 3.042115 sec INFO: No Floating Point Exceptions have been reported - 6,615,724,908 cycles # 2.169 GHz - 12,863,710,849 instructions # 1.94 insn per cycle - 3.051492012 seconds time 
elapsed + 6,626,503,704 cycles # 2.175 GHz + 12,863,711,552 instructions # 1.94 insn per cycle + 3.047484573 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index b613786442..3a0fd0a90a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:33:39 +DATE: 2024-06-02_20:50:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.604585e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.336536e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.343718e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.463816e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.296908e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328725e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.571057 sec +TOTAL : 0.577578 sec INFO: No Floating Point Exceptions 
have been reported - 2,242,744,669 cycles # 2.822 GHz - 3,531,920,926 instructions # 1.57 insn per cycle - 0.851832101 seconds time elapsed + 2,276,498,493 cycles # 2.822 GHz + 3,528,632,890 instructions # 1.55 insn per cycle + 0.863625263 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.153085e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.384746e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.384746e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.154961e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387398e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.830325 sec +TOTAL : 5.823373 sec INFO: No Floating Point Exceptions have been reported - 16,691,813,815 cycles # 2.861 GHz - 41,266,181,474 instructions # 2.47 insn per cycle - 5.835359179 seconds time elapsed + 16,689,004,614 cycles # 2.865 GHz + 41,263,252,653 instructions # 2.47 insn per cycle + 5.828275571 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) 
[cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.304367e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.587722e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.587722e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.271492e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.527103e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.527103e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.109672 sec +TOTAL : 3.154514 sec INFO: No Floating Point Exceptions have been reported - 8,995,426,679 cycles # 2.889 GHz - 21,211,089,826 instructions # 2.36 insn per cycle - 3.114839321 seconds time elapsed + 9,027,063,562 cycles # 2.858 GHz + 21,210,233,128 instructions # 2.35 insn per cycle + 3.159850522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.420820e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.716893e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.716893e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387654e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.648132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.648132e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.969606 sec +TOTAL : 3.011648 sec INFO: No Floating Point Exceptions have been reported - 8,272,952,138 cycles # 2.782 GHz - 15,425,102,157 instructions # 1.86 insn per cycle - 2.974640700 seconds time elapsed + 8,243,811,729 cycles # 2.736 GHz + 15,422,236,523 instructions # 1.87 insn per cycle 
+ 3.016578474 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.475869e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.854556e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.854556e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.441553e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.770792e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.770792e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.912175 sec +TOTAL : 2.953944 sec INFO: No Floating Point Exceptions have been reported - 8,117,590,540 cycles # 2.783 GHz - 15,233,342,033 instructions # 1.88 insn per cycle - 2.917189383 seconds time elapsed + 8,107,162,693 cycles # 2.740 GHz + 15,232,791,801 instructions # 1.88 insn per cycle + 2.959262610 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.412788e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.668874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.668874e+06 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 2.373912e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600116e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600116e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.982256 sec +TOTAL : 3.031130 sec INFO: No Floating Point Exceptions have been reported - 6,592,409,084 cycles # 2.208 GHz - 12,843,659,599 instructions # 1.95 insn per cycle - 2.987368722 seconds time elapsed + 6,600,460,683 cycles # 2.175 GHz + 12,841,921,234 instructions # 1.95 insn per cycle + 3.036614829 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index e6d46e5965..9c3ce37c8b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:51:09 +DATE: 2024-06-02_21:17:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.307514e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195447e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.293637e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.329688e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197353e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.294513e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.586214 sec +TOTAL : 0.584260 sec INFO: No Floating Point Exceptions have been reported - 2,288,759,129 cycles # 2.822 GHz - 3,563,945,826 instructions # 1.56 insn per cycle - 0.869586754 seconds time elapsed + 2,290,837,964 cycles # 2.826 GHz + 3,567,143,300 instructions # 1.56 insn per cycle + 0.867513946 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.594420e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
2.075669e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.075669e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.595326e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079631e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.325231 sec +TOTAL : 4.323662 sec INFO: No Floating Point Exceptions have been reported - 12,164,411,956 cycles # 2.810 GHz - 32,427,707,417 instructions # 2.67 insn per cycle - 4.330470336 seconds time elapsed + 12,200,237,668 cycles # 2.819 GHz + 32,427,514,864 instructions # 2.66 insn per cycle + 4.329187293 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.607791e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.429113e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.429113e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.615965e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.427412e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.427412e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.796823 sec +TOTAL : 2.793932 sec INFO: No Floating Point Exceptions have been reported - 8,009,111,157 cycles # 2.859 GHz - 18,657,618,704 instructions # 2.33 insn per cycle - 2.802139139 seconds time elapsed + 7,996,574,580 cycles # 2.860 GHz + 18,655,165,559 instructions # 2.33 insn per cycle + 2.799028174 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.719759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.472647e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.472647e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.687284e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.399964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.399964e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.690741 sec +TOTAL : 2.721615 sec INFO: No Floating Point Exceptions have been reported - 7,416,864,109 cycles # 2.752 GHz - 14,251,974,045 instructions # 1.92 insn per cycle - 2.696083346 seconds time elapsed + 7,454,408,451 cycles # 2.735 GHz + 14,253,415,404 instructions # 1.91 insn per cycle + 2.727046264 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.774877e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.630440e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.630440e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.770821e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.626363e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 4.626363e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.645076 sec +TOTAL : 2.650953 sec INFO: No Floating Point Exceptions have been reported - 7,291,130,406 cycles # 2.752 GHz - 13,948,384,567 instructions # 1.91 insn per cycle - 2.650598467 seconds time elapsed + 7,318,335,472 cycles # 2.756 GHz + 13,948,037,827 instructions # 1.91 insn per cycle + 2.656532072 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.434115e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.741003e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.741003e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.428325e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.729451e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729451e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.962177 sec +TOTAL : 2.970494 sec INFO: No Floating Point Exceptions have been reported - 6,479,327,720 cycles # 2.184 GHz - 13,423,401,797 instructions # 2.07 insn per cycle - 2.967420151 seconds time elapsed + 6,503,944,976 cycles # 2.187 GHz + 13,423,073,698 instructions # 2.06 insn per cycle + 2.975897923 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 1851f3246c..76b55ad2e4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:51:33 +DATE: 2024-06-02_21:17:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.308899e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200904e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329787e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.324695e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.210479e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.333686e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585570 sec +TOTAL : 0.587402 sec INFO: No Floating Point Exceptions have been reported - 2,293,480,451 cycles # 2.820 GHz - 3,552,176,680 instructions # 1.55 insn per cycle - 0.870100804 seconds time elapsed + 2,292,933,061 cycles # 2.822 GHz + 3,530,135,889 instructions # 1.54 insn per cycle + 0.871362762 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.129885e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080551e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.106790e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.056635e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.056635e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.330346 sec +TOTAL : 3.364784 sec INFO: No Floating Point Exceptions have been reported - 9,425,530,261 cycles # 2.826 GHz - 25,263,309,757 instructions # 2.68 insn per cycle - 3.335509619 seconds time elapsed + 9,485,686,184 cycles # 2.815 GHz + 25,263,356,042 instructions # 2.66 insn per cycle + 3.370276038 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.953227e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.493970e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.493970e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.961447e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.505102e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.505102e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.514743 sec +TOTAL : 2.509947 sec INFO: No Floating Point Exceptions have been reported - 7,199,101,915 cycles # 2.858 GHz - 16,870,111,415 instructions # 2.34 insn per cycle - 2.520226033 seconds time elapsed + 7,195,839,812 cycles # 2.862 GHz + 16,868,387,762 instructions # 2.34 insn per cycle + 2.515387369 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.869871e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.903620e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.903620e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.832943e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.808929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.808929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.568153 sec +TOTAL : 2.598583 sec INFO: No Floating Point Exceptions have been reported - 7,089,400,745 cycles # 2.756 GHz - 13,616,924,187 instructions # 1.92 insn per cycle - 2.573571442 seconds time elapsed + 7,142,287,207 cycles # 2.744 GHz + 13,617,950,967 instructions # 1.91 insn per cycle + 2.604113274 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.911116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.034757e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.034757e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.905826e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.012392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.012392e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.534380 sec +TOTAL : 2.542873 sec INFO: No Floating Point Exceptions have been reported - 7,042,060,221 cycles # 2.774 GHz - 13,426,671,587 instructions # 1.91 insn per cycle - 2.539847169 seconds time elapsed + 7,030,524,595 cycles # 2.760 GHz + 13,426,027,213 instructions # 1.91 insn per cycle + 2.548245628 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.525985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967300e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.967300e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.533501e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.984604e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.984604e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 
3.270342e-06 ) GeV^0 -TOTAL : 2.871748 sec +TOTAL : 2.864382 sec INFO: No Floating Point Exceptions have been reported - 6,325,625,286 cycles # 2.199 GHz - 13,154,721,049 instructions # 2.08 insn per cycle - 2.877120825 seconds time elapsed + 6,329,632,912 cycles # 2.206 GHz + 13,154,745,067 instructions # 2.08 insn per cycle + 2.870076647 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index b626a014f8..30bc197182 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:34:05 +DATE: 2024-06-02_20:50:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.830025e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.944832e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.160865e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.732879e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.705839e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.157350e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.662041 sec +TOTAL : 0.682384 sec INFO: No Floating Point Exceptions have been reported - 2,559,219,510 cycles # 2.861 GHz - 3,969,506,530 instructions # 1.55 insn per cycle - 0.952802853 seconds time elapsed + 2,597,825,046 cycles # 2.830 GHz + 4,090,205,188 instructions # 1.57 insn per cycle + 0.977759023 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.044346e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.221236e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221236e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.031288e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.206220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.206220e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.439706 sec +TOTAL : 6.522879 sec INFO: No Floating Point Exceptions have been reported - 18,648,827,254 cycles # 2.894 GHz - 44,218,351,924 instructions # 2.37 insn per cycle - 6.444755062 seconds time elapsed + 18,693,180,609 cycles # 2.864 GHz + 44,222,141,009 instructions # 2.37 insn per cycle + 6.528296312 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634240e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.158489e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.158489e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.613969e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.133791e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.133791e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.270020 sec +TOTAL : 4.323747 sec INFO: No Floating Point Exceptions have been reported - 12,337,216,169 cycles # 2.886 GHz - 30,918,100,190 instructions # 2.51 insn per cycle - 4.275170664 seconds time elapsed + 12,389,439,717 cycles # 2.863 GHz + 30,920,154,197 instructions # 2.50 insn per cycle + 4.329261731 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.943703e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.696046e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.696046e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.668547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.668547e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.651391 sec +TOTAL : 3.698108 sec INFO: No Floating Point Exceptions have been reported - 10,097,284,751 cycles # 2.762 GHz - 19,374,074,587 instructions # 1.92 insn per cycle - 3.656592402 seconds time elapsed + 10,087,612,198 cycles # 2.725 GHz + 19,373,367,445 instructions # 1.92 insn per cycle + 3.703569148 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.039225e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.880994e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.880994e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.984824e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.800128e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.800128e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.498933 sec +TOTAL : 3.590549 sec INFO: No Floating Point Exceptions have been reported - 9,699,890,764 cycles # 2.769 GHz - 18,944,296,026 instructions # 1.95 insn per cycle - 3.504313379 seconds time elapsed + 9,780,533,240 cycles # 2.721 GHz + 18,954,616,108 instructions # 1.94 insn per cycle + 3.596015305 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.766168e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359450e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359450e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.719226e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.286801e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.286801e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.977432 sec +TOTAL : 4.082921 sec INFO: No Floating Point Exceptions have been reported - 8,362,626,878 cycles # 2.101 GHz - 15,058,722,791 instructions # 1.80 insn per cycle - 3.982532855 seconds time elapsed + 8,420,905,067 cycles # 2.060 GHz + 15,057,078,071 instructions # 1.79 insn per cycle + 4.088371462 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index f9780717c1..05d5e2d3d7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-16_14:34:35 +DATE: 2024-06-02_20:51:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.831074e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.944999e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.163112e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.579969e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.686020e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.157503e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.663702 sec +TOTAL : 0.687280 sec INFO: No Floating Point Exceptions have been reported - 2,550,713,530 cycles # 2.845 GHz - 3,995,712,636 instructions # 1.57 insn per cycle - 0.958037940 seconds time elapsed + 2,618,766,619 cycles # 2.826 GHz + 4,041,753,204 instructions # 1.54 insn per cycle + 0.983494772 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.088480e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.075110e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.266502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.266502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.193535 sec +TOTAL : 6.273174 sec INFO: No Floating Point Exceptions have been reported - 17,967,058,694 cycles # 2.899 GHz - 42,467,805,223 instructions # 2.36 insn per cycle - 6.198684795 seconds time elapsed + 17,976,200,983 cycles # 2.863 GHz + 42,467,527,484 instructions # 2.36 insn per cycle + 6.278790336 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.676284e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.231904e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.231904e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.651801e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 2.199325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.199325e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.170684 sec +TOTAL : 4.233700 sec INFO: No Floating Point Exceptions have been reported - 12,134,694,075 cycles # 2.906 GHz - 30,224,929,059 instructions # 2.49 insn per cycle - 4.175943490 seconds time elapsed + 12,135,466,974 cycles # 2.863 GHz + 30,227,050,455 instructions # 2.49 insn per cycle + 4.239313548 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.950812e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.735198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.735198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.928405e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.679872e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.679872e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.639278 sec +TOTAL : 3.683536 sec INFO: No Floating Point Exceptions have been reported - 10,078,657,444 cycles # 2.766 GHz - 19,257,126,653 instructions # 1.91 insn per cycle - 3.644365244 seconds time elapsed + 10,048,331,425 cycles # 2.724 GHz + 19,255,994,226 instructions # 1.92 insn per cycle + 3.689022984 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.049769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.898049e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.898049e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.015749e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.854641e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.854641e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.481211 sec +TOTAL : 3.541051 sec INFO: No Floating Point Exceptions have been reported - 9,647,917,970 cycles # 2.768 GHz - 18,746,418,128 instructions # 1.94 insn per cycle - 3.486360008 seconds time elapsed + 9,640,245,530 cycles # 2.719 GHz + 18,744,573,817 instructions # 1.94 insn per cycle + 3.546545299 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.796433e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.409552e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.409552e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.763865e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.360552e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.360552e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 
3.270315e-06 ) GeV^0 -TOTAL : 3.919050 sec +TOTAL : 3.989990 sec INFO: No Floating Point Exceptions have been reported - 8,244,471,456 cycles # 2.102 GHz - 14,980,246,059 instructions # 1.82 insn per cycle - 3.924194596 seconds time elapsed + 8,258,527,369 cycles # 2.068 GHz + 14,978,587,265 instructions # 1.81 insn per cycle + 3.995476429 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 205a4bf5b6..e6ca4b3727 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:35:05 +DATE: 2024-06-02_20:51:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.201162e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181610e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277713e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.571906e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165675e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.278273e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525559 sec +TOTAL : 0.533824 sec INFO: No Floating Point Exceptions have been reported - 2,155,305,398 cycles # 2.849 GHz - 3,120,666,963 instructions # 1.45 insn per cycle - 0.814520269 seconds time elapsed + 2,160,338,708 cycles # 2.816 GHz + 3,108,947,549 instructions # 1.44 insn per cycle + 0.826788991 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.129905e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.129905e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.052422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.113200e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.113200e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.176250 sec +TOTAL : 5.219551 sec INFO: No Floating Point Exceptions have been reported - 15,001,077,825 cycles # 2.896 GHz - 38,374,710,401 instructions # 2.56 insn per cycle - 5.181415080 seconds time elapsed + 14,990,786,657 cycles # 2.870 GHz + 38,373,509,892 instructions # 2.56 insn per cycle + 5.225136790 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.492980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.684039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.684039e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.420355e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.607688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607688e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.109528 sec +TOTAL : 3.176496 sec INFO: No Floating Point Exceptions have been reported - 9,049,547,879 cycles # 2.906 GHz - 24,578,150,431 instructions # 2.72 insn per cycle - 3.114795475 seconds time elapsed + 9,107,869,375 cycles # 2.863 GHz + 24,577,368,445 instructions # 2.70 insn per cycle + 3.182042391 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.554648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.034559e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.034559e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.505330e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.986443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.986443e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.996517 sec +TOTAL : 2.015782 sec INFO: No Floating Point Exceptions have been reported - 5,443,502,791 cycles # 2.721 GHz - 11,251,469,346 instructions # 2.07 insn per cycle - 2.001703471 seconds time elapsed + 5,458,675,505 cycles # 2.701 GHz + 11,252,130,547 instructions # 2.06 insn per cycle + 2.021434813 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.119114e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.713742e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.713742e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.084191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.669186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 6.669186e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.821745 sec +TOTAL : 1.833429 sec INFO: No Floating Point Exceptions have been reported - 4,960,408,882 cycles # 2.716 GHz - 10,558,806,229 instructions # 2.13 insn per cycle - 1.826903839 seconds time elapsed + 4,938,916,416 cycles # 2.687 GHz + 10,556,489,069 instructions # 2.14 insn per cycle + 1.839288416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.693426e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.898518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.898518e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.610931e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.807983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.807983e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.947911 sec +TOTAL : 3.014130 sec INFO: No Floating Point Exceptions have been reported - 5,367,244,097 cycles # 1.818 GHz - 7,793,958,391 instructions # 1.45 insn per cycle - 2.953294554 seconds time elapsed + 5,379,542,844 cycles # 1.782 GHz + 7,793,225,348 instructions # 1.45 insn per cycle + 3.019714352 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 4b2366d44f..1fa6968bab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:00:55 +DATE: 2024-06-02_21:27:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.373758e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.924060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.924060e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.462166e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.205101e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.205101e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.825705 sec +TOTAL : 0.820247 sec INFO: No Floating Point Exceptions have been reported - 3,037,157,201 cycles # 2.832 GHz - 4,768,877,833 instructions # 1.57 insn per cycle - 1.128818887 seconds time elapsed + 3,038,196,334 cycles # 2.828 GHz + 4,716,972,261 instructions # 1.55 insn per cycle + 1.131805276 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,15 +90,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.032947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.092197e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.092197e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.034127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.094165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.344113 sec +TOTAL : 5.347576 sec INFO: No Floating Point Exceptions have been reported - 15,315,317,736 cycles # 2.863 GHz - 38,433,762,310 instructions # 2.51 insn per cycle - 5.351126978 seconds time elapsed + 15,332,963,137 cycles # 2.864 GHz + 38,433,385,565 instructions # 2.51 insn per cycle + 5.354839876 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,15 +119,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.394451e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.578816e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.578816e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.391806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576311e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576311e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) 
GeV^0 -TOTAL : 3.279010 sec +TOTAL : 3.286080 sec INFO: No Floating Point Exceptions have been reported - 9,390,215,737 cycles # 2.859 GHz - 24,761,602,813 instructions # 2.64 insn per cycle - 3.285914811 seconds time elapsed + 9,426,586,757 cycles # 2.864 GHz + 24,763,935,790 instructions # 2.63 insn per cycle + 3.293082254 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.346272e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.804430e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.804430e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.330091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.783188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.783188e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.151067 sec +TOTAL : 2.166949 sec INFO: No Floating Point Exceptions have been reported - 5,795,064,676 cycles # 2.687 GHz - 11,538,955,643 instructions # 1.99 insn per cycle - 2.157987463 seconds time elapsed + 5,830,307,091 cycles # 2.683 GHz + 11,537,845,857 instructions # 1.98 insn per cycle + 2.174147046 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -177,15 +177,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.949125e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.512113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.512113e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.935200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.505542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.505542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.953091 sec +TOTAL : 1.966012 sec INFO: No Floating Point Exceptions have been reported - 5,277,608,562 cycles # 2.695 GHz - 10,845,633,589 instructions # 2.06 insn per cycle - 1.960046746 seconds time elapsed + 5,308,540,347 cycles # 2.692 GHz + 10,845,350,411 instructions # 2.04 insn per cycle + 1.973191716 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -206,15 +206,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.545325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.736253e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.736253e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557121e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.748967e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748967e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.149111 sec +TOTAL : 3.148015 sec INFO: No Floating Point Exceptions have been reported - 5,725,568,726 cycles # 1.815 GHz - 8,037,864,149 instructions # 1.40 insn per cycle - 3.156036160 seconds time elapsed + 
5,759,134,449 cycles # 1.827 GHz + 8,037,556,808 instructions # 1.40 insn per cycle + 3.155199268 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 66fdf9efe4..564b56aaa2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:11:59 +DATE: 2024-06-02_21:38:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.582142e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158915e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274993e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.834225e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174853e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276595e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.629579 sec +TOTAL : 0.628184 sec INFO: No Floating Point Exceptions have been 
reported - 2,438,671,292 cycles # 2.828 GHz - 3,557,518,240 instructions # 1.46 insn per cycle - 0.918692112 seconds time elapsed + 2,420,276,537 cycles # 2.823 GHz + 3,537,411,505 instructions # 1.46 insn per cycle + 0.914703116 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.063642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.124319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.124319e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.042150e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.102920e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.102920e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.250994 sec +TOTAL : 5.309711 sec INFO: No Floating Point Exceptions have been reported - 15,177,224,624 cycles # 2.888 GHz - 38,389,589,114 instructions # 2.53 insn per cycle - 5.256694767 seconds time elapsed + 15,208,601,758 cycles # 2.862 GHz + 38,393,755,680 instructions # 2.52 insn per cycle + 5.315290014 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) 
[cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.469246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.659787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.659787e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.440689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.192043 sec +TOTAL : 3.221641 sec INFO: No Floating Point Exceptions have been reported - 9,234,869,625 cycles # 2.889 GHz - 24,577,322,685 instructions # 2.66 insn per cycle - 3.197667860 seconds time elapsed + 9,237,771,832 cycles # 2.863 GHz + 24,577,605,010 instructions # 2.66 insn per cycle + 3.227257120 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.520662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.999169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.999169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.426554e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.901258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.901258e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.069942 sec +TOTAL : 2.107966 sec INFO: No Floating Point Exceptions have been reported - 5,642,462,557 cycles # 2.720 GHz - 11,233,692,701 instructions # 1.99 insn per cycle - 2.075542898 seconds time elapsed + 5,642,384,352 cycles # 2.670 GHz + 11,234,139,166 instructions # 1.99 insn per cycle + 
2.113587927 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.151383e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740134e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740134e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.038246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.623434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.623434e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.875924 sec +TOTAL : 1.913383 sec INFO: No Floating Point Exceptions have been reported - 5,122,190,825 cycles # 2.724 GHz - 10,508,387,782 instructions # 2.05 insn per cycle - 1.881606947 seconds time elapsed + 5,136,905,922 cycles # 2.679 GHz + 10,506,331,510 instructions # 2.05 insn per cycle + 1.919115165 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.617306e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.815381e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.815381e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) 
= ( 3.577150e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.773033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.773033e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.070946 sec +TOTAL : 3.107472 sec INFO: No Floating Point Exceptions have been reported - 5,582,158,144 cycles # 1.816 GHz - 7,742,870,902 instructions # 1.39 insn per cycle - 3.076599052 seconds time elapsed + 5,603,483,278 cycles # 1.801 GHz + 7,744,927,992 instructions # 1.38 insn per cycle + 3.113151850 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 82194f6fe3..1ac31e13f6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:09:12 +DATE: 2024-06-02_21:35:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.587611e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161872e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276844e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.828351e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175882e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277851e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.568860 sec +TOTAL : 0.567256 sec INFO: No Floating Point Exceptions have been reported - 2,269,706,021 cycles # 2.822 GHz - 3,484,022,632 instructions # 1.54 insn per cycle - 0.860923648 seconds time elapsed + 2,253,776,484 cycles # 2.822 GHz + 3,518,982,258 instructions # 1.56 insn per cycle + 0.855545893 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045564e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.105865e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.105865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.056356e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.117975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.117975e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.235402 sec +TOTAL : 5.211101 sec INFO: No Floating Point Exceptions have been reported - 14,997,948,844 cycles # 2.862 GHz - 38,373,416,469 instructions # 2.56 insn per cycle - 5.240872993 seconds time elapsed + 15,016,692,408 cycles # 2.879 GHz + 38,373,187,740 instructions # 2.56 insn per cycle + 5.216643799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.444491e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.632712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.632712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.467027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.660047e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.660047e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.153505 sec +TOTAL : 3.136723 sec INFO: No Floating Point Exceptions have been reported - 9,049,779,346 cycles # 2.866 GHz - 24,577,971,625 instructions # 2.72 insn per cycle - 3.158944927 seconds time elapsed + 9,074,004,106 cycles # 2.889 GHz + 24,577,979,212 instructions # 2.71 insn per cycle + 3.142372869 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.437117e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.904229e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.904229e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.443155e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.918966e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.918966e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.038232 sec +TOTAL : 2.039960 sec INFO: No Floating Point Exceptions have been reported - 5,473,582,641 cycles # 2.680 GHz - 11,251,858,191 instructions # 2.06 insn per cycle - 2.043714380 seconds time elapsed + 5,475,053,241 cycles # 2.677 GHz + 11,251,295,295 instructions # 2.06 insn per cycle + 2.045540905 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.066218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.650713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.650713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.050564e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.643604e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 6.643604e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.837947 sec +TOTAL : 1.847523 sec INFO: No Floating Point Exceptions have been reported - 4,942,309,563 cycles # 2.682 GHz - 10,557,200,123 instructions # 2.14 insn per cycle - 1.844865568 seconds time elapsed + 4,963,669,399 cycles # 2.679 GHz + 10,556,626,951 instructions # 2.13 insn per cycle + 1.853394046 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.598977e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.794496e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.794496e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.592373e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.788368e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.788368e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.023039 sec +TOTAL : 3.031855 sec INFO: No Floating Point Exceptions have been reported - 5,367,715,100 cycles # 1.773 GHz - 7,793,769,749 instructions # 1.45 insn per cycle - 3.028517366 seconds time elapsed + 5,406,083,686 cycles # 1.780 GHz + 7,793,724,258 instructions # 1.44 insn per cycle + 3.037456860 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 3db0a99453..5a92d6747d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:06:28 +DATE: 2024-06-02_21:32:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.591450e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156507e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275190e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.752555e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167872e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274609e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.728834 sec +TOTAL : 0.720859 sec INFO: No Floating Point Exceptions have been reported - 2,711,621,820 cycles # 2.826 GHz - 4,288,575,941 instructions # 1.58 insn per cycle - 1.017933550 seconds time elapsed + 2,687,445,313 cycles # 2.827 GHz + 4,266,641,488 instructions # 1.59 insn per cycle + 1.008523478 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -83,15 +83,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.105988e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.105988e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.046564e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.107338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.107338e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.234926 sec +TOTAL : 5.235890 sec INFO: No Floating Point Exceptions have been reported - 14,996,539,700 cycles # 2.862 GHz - 38,373,492,139 instructions # 2.56 insn per cycle - 5.240540958 seconds time elapsed + 15,009,426,379 cycles # 2.865 GHz + 38,373,420,682 instructions # 2.56 insn per cycle + 5.241597173 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.431303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.617753e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.617753e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.437646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.628123e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.628123e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
-TOTAL : 3.165641 sec +TOTAL : 3.163337 sec INFO: No Floating Point Exceptions have been reported - 9,072,261,960 cycles # 2.862 GHz - 24,578,342,604 instructions # 2.71 insn per cycle - 3.171145800 seconds time elapsed + 9,075,620,291 cycles # 2.865 GHz + 24,578,067,979 instructions # 2.71 insn per cycle + 3.168963563 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -139,15 +139,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.460196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.936686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.936686e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.418817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.893074e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.893074e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.030403 sec +TOTAL : 2.047725 sec INFO: No Floating Point Exceptions have been reported - 5,452,336,471 cycles # 2.679 GHz - 11,251,160,510 instructions # 2.06 insn per cycle - 2.035938093 seconds time elapsed + 5,485,987,071 cycles # 2.673 GHz + 11,251,055,584 instructions # 2.05 insn per cycle + 2.053325818 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, 
zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.063893e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.649981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.649981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.062072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.650490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.650490e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.838982 sec +TOTAL : 1.841979 sec INFO: No Floating Point Exceptions have been reported - 4,938,631,038 cycles # 2.680 GHz - 10,556,930,414 instructions # 2.14 insn per cycle - 1.844618889 seconds time elapsed + 4,952,856,258 cycles # 2.682 GHz + 10,558,518,877 instructions # 2.13 insn per cycle + 1.847644905 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -195,15 +195,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.589787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.785615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.785615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.597246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.793746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.793746e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.030446 sec +TOTAL : 3.028351 sec INFO: No Floating Point Exceptions have been reported - 5,385,276,295 cycles # 1.774 GHz - 7,793,583,016 instructions # 1.45 insn per cycle - 3.036161028 seconds time elapsed + 5,400,904,302 
cycles # 1.781 GHz + 7,793,425,655 instructions # 1.44 insn per cycle + 3.034012590 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 0caf1293cf..58e2659367 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:35:28 +DATE: 2024-06-02_20:52:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.206695e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183658e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279171e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.589601e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168593e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279897e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.521961 sec +TOTAL : 0.529965 sec INFO: No Floating Point Exceptions have been reported - 2,148,802,757 cycles # 2.845 GHz - 
3,054,152,486 instructions # 1.42 insn per cycle - 0.812117976 seconds time elapsed + 2,176,096,086 cycles # 2.821 GHz + 3,116,651,489 instructions # 1.43 insn per cycle + 0.828693316 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068168e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.129039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.129039e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.046807e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.107510e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.107510e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.177399 sec +TOTAL : 5.232787 sec INFO: No Floating Point Exceptions have been reported - 15,011,872,798 cycles # 2.897 GHz - 40,100,761,049 instructions # 2.67 insn per cycle - 5.182501125 seconds time elapsed + 15,011,109,733 cycles # 2.866 GHz + 40,100,143,330 instructions # 2.67 insn per cycle + 5.238308472 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 3.634343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.844834e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.844834e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.594471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800449e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.993727 sec +TOTAL : 3.028090 sec INFO: No Floating Point Exceptions have been reported - 8,671,029,072 cycles # 2.892 GHz - 23,670,969,931 instructions # 2.73 insn per cycle - 2.999072752 seconds time elapsed + 8,685,388,720 cycles # 2.864 GHz + 23,672,029,686 instructions # 2.73 insn per cycle + 3.033712399 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.945254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.323667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.323667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.870985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.244397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.244397e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.228700 sec +TOTAL : 2.263752 sec INFO: No Floating Point Exceptions have been reported - 6,081,438,462 cycles # 2.724 GHz - 13,061,002,322 instructions # 2.15 insn per cycle - 2.233958089 seconds time elapsed + 6,080,549,535 cycles # 2.681 GHz + 13,060,990,924 instructions # 2.15 insn per cycle + 2.269275292 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.205594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.622405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.622405e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.126541e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.542200e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.542200e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.121856 sec +TOTAL : 2.155718 sec INFO: No Floating Point Exceptions have been reported - 5,798,891,312 cycles # 2.727 GHz - 12,319,969,769 instructions # 2.12 insn per cycle - 2.127030294 seconds time elapsed + 5,801,329,189 cycles # 2.685 GHz + 12,321,707,264 instructions # 2.12 insn per cycle + 2.161291942 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.380432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.550251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.550251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.300125e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.464144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.464144e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.209519 sec +TOTAL : 3.288245 sec INFO: No Floating Point Exceptions have been reported - 5,821,355,640 cycles # 1.812 GHz - 9,603,981,726 instructions # 1.65 insn per cycle - 3.214724733 seconds time elapsed + 5,828,079,793 cycles # 1.770 GHz + 9,603,396,173 instructions # 1.65 insn per cycle + 3.293946839 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 6af05ea7e1..eacee14a97 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:51:55 +DATE: 2024-06-02_21:18:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.681198e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.166116e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276872e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.680825e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167882e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277071e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529611 sec +TOTAL : 0.530821 sec INFO: No Floating Point Exceptions have been reported - 2,190,477,637 cycles # 2.832 GHz - 3,135,955,530 instructions # 1.43 insn per cycle - 0.830299558 seconds time elapsed + 2,180,475,973 cycles # 2.822 GHz + 3,129,539,684 instructions # 1.44 insn per cycle + 0.829975432 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.383572e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.466296e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.466296e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369428e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.450686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450686e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.507918 sec +TOTAL : 4.536778 sec INFO: No Floating Point Exceptions have been reported - 13,013,442,526 cycles # 2.884 GHz - 34,387,029,075 instructions # 2.64 insn per cycle - 4.513459426 seconds time elapsed + 13,015,636,000 cycles # 2.866 GHz + 34,387,703,055 instructions # 2.64 insn per cycle + 4.542541003 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.946707e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.083881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.083881e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.924099e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.059855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.059855e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.667816 sec +TOTAL : 3.697505 sec INFO: No Floating Point Exceptions have been reported - 10,591,846,077 cycles # 2.884 GHz - 24,007,245,790 instructions # 2.27 insn per cycle - 3.673406920 seconds time elapsed + 10,607,346,172 cycles # 2.865 GHz + 24,007,082,338 instructions # 2.26 insn per cycle + 3.703200013 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.532632e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.849376e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.849376e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.415164e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.721166e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.721166e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.423240 sec +TOTAL : 2.486573 sec INFO: No Floating Point Exceptions have been reported - 6,577,855,979 cycles # 2.709 GHz - 12,401,365,684 instructions # 1.89 insn per cycle - 2.428791768 seconds time elapsed + 6,676,542,764 cycles # 2.680 GHz + 12,401,383,261 instructions # 1.86 insn per cycle + 2.492408166 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.754457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.104775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.104775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.719363e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.070259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 5.070259e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.314834 sec +TOTAL : 2.333222 sec INFO: No Floating Point Exceptions have been reported - 6,233,998,487 cycles # 2.688 GHz - 11,576,068,199 instructions # 1.86 insn per cycle - 2.320534715 seconds time elapsed + 6,249,988,604 cycles # 2.673 GHz + 11,572,934,567 instructions # 1.85 insn per cycle + 2.339105101 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.687851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.893233e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.893233e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.635891e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.836056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.836056e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.952132 sec +TOTAL : 2.994997 sec INFO: No Floating Point Exceptions have been reported - 5,323,772,693 cycles # 1.802 GHz - 9,296,912,008 instructions # 1.75 insn per cycle - 2.957828928 seconds time elapsed + 5,329,559,296 cycles # 1.777 GHz + 9,295,784,708 instructions # 1.74 insn per cycle + 3.000603709 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 2040ec21eb..2a7449ccf8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:52:19 +DATE: 2024-06-02_21:18:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.680230e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168644e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.280417e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.690021e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.170643e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281113e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532171 sec +TOTAL : 0.529016 sec INFO: No Floating Point Exceptions have been reported - 2,169,507,018 cycles # 2.828 GHz - 3,115,355,964 instructions # 1.44 insn per cycle - 0.826043020 seconds time elapsed + 2,181,786,873 cycles # 2.820 GHz + 3,123,798,739 instructions # 1.43 insn per cycle + 0.830314726 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": 
launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.524819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.617052e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.617052e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.498656e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.589488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.589488e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.262483 sec +TOTAL : 4.307566 sec INFO: No Floating Point Exceptions have been reported - 12,358,560,610 cycles # 2.896 GHz - 35,037,446,637 instructions # 2.84 insn per cycle - 4.268207887 seconds time elapsed + 12,355,577,175 cycles # 2.865 GHz + 35,037,181,929 instructions # 2.84 insn per cycle + 4.313266681 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.908483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.040450e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.040450e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.900568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.034514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.034514e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 
3.714757 sec +TOTAL : 3.726420 sec INFO: No Floating Point Exceptions have been reported - 10,745,562,014 cycles # 2.889 GHz - 23,084,374,218 instructions # 2.15 insn per cycle - 3.720383315 seconds time elapsed + 10,682,800,271 cycles # 2.863 GHz + 23,083,133,822 instructions # 2.16 insn per cycle + 3.732131064 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.878271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.246530e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.246530e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.781411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.142736e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.142736e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.258864 sec +TOTAL : 2.304487 sec INFO: No Floating Point Exceptions have been reported - 6,151,591,588 cycles # 2.717 GHz - 11,956,808,073 instructions # 1.94 insn per cycle - 2.264473200 seconds time elapsed + 6,156,370,918 cycles # 2.666 GHz + 11,956,053,429 instructions # 1.94 insn per cycle + 2.310098952 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) 
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.958079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.345089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.345089e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.888437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.261805e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.261805e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.224234 sec +TOTAL : 2.256063 sec INFO: No Floating Point Exceptions have been reported - 6,017,653,055 cycles # 2.700 GHz - 11,128,128,624 instructions # 1.85 insn per cycle - 2.229785356 seconds time elapsed + 6,010,669,476 cycles # 2.659 GHz + 11,128,968,945 instructions # 1.85 insn per cycle + 2.261765055 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739650e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.951827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.951827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.710789e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.919169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.919169e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.913360 sec +TOTAL : 2.936689 sec INFO: No Floating Point Exceptions have been reported - 5,212,798,448 cycles # 1.786 GHz - 9,020,884,070 instructions # 1.73 insn per cycle - 2.919040069 seconds time elapsed + 5,226,987,569 cycles # 
1.777 GHz + 9,022,159,593 instructions # 1.73 insn per cycle + 2.942633203 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 93f412dad4..109477ba28 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:35:52 +DATE: 2024-06-02_20:52:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.088595e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.705968e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969781e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.016847e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.697629e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.981893e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.482195 sec +TOTAL : 0.482617 sec INFO: No Floating Point Exceptions have been reported - 2,007,920,858 cycles # 2.849 GHz - 
2,840,933,430 instructions # 1.41 insn per cycle - 0.763422225 seconds time elapsed + 1,998,642,469 cycles # 2.818 GHz + 2,880,747,261 instructions # 1.44 insn per cycle + 0.766260624 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.200574e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.271569e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.271569e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.189515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.261171e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.261171e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.850875 sec +TOTAL : 4.877134 sec INFO: No Floating Point Exceptions have been reported - 14,073,569,281 cycles # 2.899 GHz - 38,343,239,881 instructions # 2.72 insn per cycle - 4.855897587 seconds time elapsed + 13,994,891,793 cycles # 2.867 GHz + 38,340,768,488 instructions # 2.74 insn per cycle + 4.882458026 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 4.925449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.332953e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.332953e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.868397e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.268797e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.268797e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.217076 sec +TOTAL : 2.243083 sec INFO: No Floating Point Exceptions have been reported - 6,436,588,824 cycles # 2.899 GHz - 15,815,821,412 instructions # 2.46 insn per cycle - 2.222049918 seconds time elapsed + 6,441,514,611 cycles # 2.866 GHz + 15,815,172,638 instructions # 2.46 insn per cycle + 2.248472682 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.963004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029520e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.029520e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.776329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.004987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004987e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.256665 sec +TOTAL : 1.283215 sec INFO: No Floating Point Exceptions have been reported - 3,455,760,948 cycles # 2.740 GHz - 7,593,976,565 instructions # 2.20 insn per cycle - 1.261861875 seconds time elapsed + 3,465,053,995 cycles # 2.691 GHz + 7,593,444,901 instructions # 2.19 insn per cycle + 1.288569725 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.569986e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110539e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110539e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.465176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096622e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096622e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.182427 sec +TOTAL : 1.196628 sec INFO: No Floating Point Exceptions have been reported - 3,244,770,474 cycles # 2.734 GHz - 7,203,559,407 instructions # 2.22 insn per cycle - 1.187623854 seconds time elapsed + 3,245,802,002 cycles # 2.702 GHz + 7,203,049,725 instructions # 2.22 insn per cycle + 1.202115916 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.864494e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.605662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.605662e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.681015e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 7.390488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.390488e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.614546 sec +TOTAL : 1.658088 sec INFO: No Floating Point Exceptions have been reported - 3,050,749,421 cycles # 1.885 GHz - 5,835,755,685 instructions # 1.91 insn per cycle - 1.619564037 seconds time elapsed + 3,068,008,054 cycles # 1.846 GHz + 5,834,677,054 instructions # 1.90 insn per cycle + 1.663441620 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 426db838d7..ecf1f25eca 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:01:19 +DATE: 2024-06-02_21:27:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.801236e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.462846e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.462846e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.942422e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.805237e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.805237e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.684862 sec +TOTAL : 0.682307 sec INFO: No Floating Point Exceptions have been reported - 2,586,573,508 cycles # 2.828 GHz - 4,016,406,941 instructions # 1.55 insn per cycle - 0.971565490 seconds time elapsed + 2,573,557,720 cycles # 2.826 GHz + 4,026,704,050 instructions # 1.56 insn per cycle + 0.968234758 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,15 +90,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.176436e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247449e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.164148e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.234526e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.234526e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.948763 sec +TOTAL : 4.979751 sec INFO: No Floating Point Exceptions have been reported - 14,176,104,430 cycles # 2.862 GHz - 38,383,843,895 instructions # 2.71 insn per cycle - 4.955194603 seconds time elapsed + 14,265,943,518 cycles # 2.864 GHz + 38,385,772,523 instructions # 2.69 insn per cycle + 4.986458560 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,15 +119,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.809798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.200764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.200764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.818621e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.210781e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.210781e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.315849 sec +TOTAL : 2.313915 sec INFO: No Floating Point 
Exceptions have been reported - 6,633,418,276 cycles # 2.858 GHz - 16,095,968,093 instructions # 2.43 insn per cycle - 2.322298973 seconds time elapsed + 6,643,916,276 cycles # 2.864 GHz + 16,095,500,762 instructions # 2.42 insn per cycle + 2.320575344 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.679036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.925640e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.925640e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.616473e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.887402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.887402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.342355 sec +TOTAL : 1.355624 sec INFO: No Floating Point Exceptions have been reported - 3,640,592,514 cycles # 2.701 GHz - 7,831,268,120 instructions # 2.15 insn per cycle - 1.348786146 seconds time elapsed + 3,674,656,527 cycles # 2.699 GHz + 7,830,907,550 instructions # 2.13 insn per cycle + 1.362279184 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -177,15 +177,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.163700e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.056629e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056629e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.223119e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.069627e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069627e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.278871 sec +TOTAL : 1.276248 sec INFO: No Floating Point Exceptions have been reported - 3,437,646,895 cycles # 2.676 GHz - 7,439,842,858 instructions # 2.16 insn per cycle - 1.285386542 seconds time elapsed + 3,470,678,953 cycles # 2.706 GHz + 7,438,963,141 instructions # 2.14 insn per cycle + 1.283084734 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -206,15 +206,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.597215e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.292791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.292791e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.605496e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.298191e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.298191e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.727252 sec +TOTAL : 1.726954 sec INFO: No Floating Point Exceptions have been reported - 3,258,697,081 cycles # 1.881 GHz - 6,089,840,836 instructions # 1.87 insn per cycle - 1.733818978 seconds time elapsed + 3,271,998,736 cycles # 1.889 GHz + 6,089,399,163 instructions # 1.86 insn per cycle + 
1.733677408 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 884891874e..0ad3eafec8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:12:23 +DATE: 2024-06-02_21:38:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.468958e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.648278e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.971571e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.959750e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.678565e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.983275e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.575797 sec +TOTAL : 0.579074 sec INFO: No Floating Point Exceptions have been reported - 2,271,357,910 cycles # 2.845 GHz - 3,342,640,625 instructions # 1.47 insn per 
cycle - 0.855647595 seconds time elapsed + 2,257,408,465 cycles # 2.820 GHz + 3,319,397,953 instructions # 1.47 insn per cycle + 0.858489340 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.198151e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.269622e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.269622e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.184974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.256690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.256690e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.914840 sec +TOTAL : 4.945704 sec INFO: No Floating Point Exceptions have been reported - 14,211,276,974 cycles # 2.889 GHz - 38,370,210,397 instructions # 2.70 insn per cycle - 4.920108721 seconds time elapsed + 14,170,625,155 cycles # 2.863 GHz + 38,370,527,150 instructions # 2.71 insn per cycle + 4.951076630 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.892733e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.301573e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.301573e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.860658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.261752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.261752e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.288479 sec +TOTAL : 2.305703 sec INFO: No Floating Point Exceptions have been reported - 6,608,042,838 cycles # 2.882 GHz - 15,829,158,403 instructions # 2.40 insn per cycle - 2.293691008 seconds time elapsed + 6,613,265,265 cycles # 2.862 GHz + 15,828,283,667 instructions # 2.39 insn per cycle + 2.311146715 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.919042e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.686618e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.970674e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.970674e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.319201 sec +TOTAL : 1.355317 sec INFO: No Floating Point Exceptions have been reported - 3,618,631,378 cycles # 2.734 GHz - 7,578,247,859 instructions # 2.09 insn per cycle - 1.324366743 seconds time elapsed + 3,633,491,212 cycles # 2.672 GHz + 7,578,117,090 instructions # 2.09 insn per cycle + 1.360829033 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.492699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.100151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.100151e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.321381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082734e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.082734e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.250231 sec +TOTAL : 1.273623 sec INFO: No Floating Point Exceptions have been reported - 3,418,366,623 cycles # 2.724 GHz - 7,152,275,486 instructions # 2.09 insn per cycle - 1.255758340 seconds time elapsed + 3,431,066,713 cycles # 2.684 GHz + 7,153,252,647 instructions # 2.08 insn per cycle + 1.278962900 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.830732e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.562097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.562097e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.705447e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.424288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 7.424288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.681529 sec +TOTAL : 1.712725 sec INFO: No Floating Point Exceptions have been reported - 3,218,452,038 cycles # 1.909 GHz - 5,786,270,960 instructions # 1.80 insn per cycle - 1.686847993 seconds time elapsed + 3,232,442,354 cycles # 1.882 GHz + 5,785,846,161 instructions # 1.79 insn per cycle + 1.718151231 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 9b5852a8c1..4e4b68c02e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:09:35 +DATE: 2024-06-02_21:35:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.497286e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.653761e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.976765e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.829888e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672325e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968559e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.520499 sec +TOTAL : 0.521072 sec INFO: No Floating Point Exceptions have been reported - 2,122,949,824 cycles # 2.819 GHz - 3,308,605,661 instructions # 1.56 insn per cycle - 0.811337951 seconds time elapsed + 2,127,774,211 cycles # 2.822 GHz + 3,322,390,866 instructions # 1.56 insn per cycle + 0.812598155 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.187282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.258952e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.258952e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.188202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.259969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.259969e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.881435 sec +TOTAL : 4.880866 sec INFO: No Floating Point Exceptions have been reported - 13,993,887,356 cycles # 2.864 GHz - 38,340,879,445 instructions # 2.74 insn per cycle - 4.886765699 seconds time elapsed + 14,003,432,297 cycles # 2.867 GHz + 38,341,113,505 instructions # 2.74 insn per cycle + 4.886313267 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.866184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.266559e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.266559e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.865782e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.266690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.266690e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.243695 sec +TOTAL : 2.245588 sec INFO: No Floating Point Exceptions have been reported - 6,437,628,216 cycles # 2.863 GHz - 15,815,570,005 instructions # 2.46 insn per cycle - 2.248941783 seconds time elapsed + 6,448,223,959 cycles # 2.866 GHz + 15,815,680,836 instructions # 2.45 insn per cycle + 2.250939876 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.699018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.949673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.949673e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.646940e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.924791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.924791e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.293092 sec +TOTAL : 1.302035 sec INFO: No Floating Point Exceptions have been reported - 3,447,035,685 cycles # 2.657 GHz - 7,594,377,345 instructions # 2.20 insn per cycle - 1.298317015 seconds time elapsed + 3,468,694,819 cycles # 2.655 GHz + 7,593,779,362 instructions # 2.19 insn per cycle + 1.307417897 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.410196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.089229e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.089229e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.341284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.086225e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
1.086225e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.202332 sec +TOTAL : 1.212139 sec INFO: No Floating Point Exceptions have been reported - 3,248,094,322 cycles # 2.691 GHz - 7,201,883,054 instructions # 2.22 insn per cycle - 1.207739630 seconds time elapsed + 3,264,189,304 cycles # 2.683 GHz + 7,202,777,705 instructions # 2.21 insn per cycle + 1.217586001 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.682713e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.392370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.392370e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.695951e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.442709e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.442709e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.657517 sec +TOTAL : 1.656278 sec INFO: No Floating Point Exceptions have been reported - 3,060,341,406 cycles # 1.842 GHz - 5,836,262,166 instructions # 1.91 insn per cycle - 1.662864711 seconds time elapsed + 3,079,117,307 cycles # 1.854 GHz + 5,835,044,949 instructions # 1.90 insn per cycle + 1.661779093 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 7e3b1fa48e..7d521e9bea 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:06:52 +DATE: 2024-06-02_21:33:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.502594e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.623050e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.943883e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.584988e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.675915e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.967452e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.628666 sec +TOTAL : 0.626563 sec INFO: No Floating Point Exceptions have been reported - 2,403,264,425 cycles # 2.820 GHz - 3,734,811,294 instructions # 1.55 insn per cycle - 0.909767197 seconds time elapsed + 2,397,623,441 cycles # 2.823 GHz + 3,754,246,774 instructions # 1.57 insn per cycle + 0.906158297 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -83,15 +83,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.185686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.257300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.257300e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.188098e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260287e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260287e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.884819 sec +TOTAL : 4.881127 sec INFO: No Floating Point Exceptions have been reported - 13,995,449,913 cycles # 2.863 GHz - 38,340,978,131 instructions # 2.74 insn per cycle - 4.889991891 seconds time elapsed + 14,001,505,232 cycles # 2.866 GHz + 38,340,997,313 instructions # 2.74 insn per cycle + 4.886650264 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.864053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.263128e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.263128e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.864903e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.265431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.265431e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 
-TOTAL : 2.244540 sec +TOTAL : 2.245406 sec INFO: No Floating Point Exceptions have been reported - 6,436,419,349 cycles # 2.862 GHz - 15,815,556,279 instructions # 2.46 insn per cycle - 2.249779623 seconds time elapsed + 6,447,851,141 cycles # 2.865 GHz + 15,815,588,340 instructions # 2.45 insn per cycle + 2.250959020 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -139,15 +139,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.799961e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.008748e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.008748e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.822705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.013257e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013257e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.279634 sec +TOTAL : 1.277005 sec INFO: No Floating Point Exceptions have been reported - 3,447,592,643 cycles # 2.685 GHz - 7,593,708,789 instructions # 2.20 insn per cycle - 1.284877623 seconds time elapsed + 3,462,079,610 cycles # 2.701 GHz + 7,593,569,628 instructions # 2.19 insn per cycle + 1.282260547 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) 
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.434984e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092289e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092289e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.456827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.099917e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.099917e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.198937 sec +TOTAL : 1.197770 sec INFO: No Floating Point Exceptions have been reported - 3,242,375,801 cycles # 2.694 GHz - 7,202,509,960 instructions # 2.22 insn per cycle - 1.204245270 seconds time elapsed + 3,264,255,856 cycles # 2.715 GHz + 7,202,978,053 instructions # 2.21 insn per cycle + 1.203170555 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -195,15 +195,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.713311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.432943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.432943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.703762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.418726e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.418726e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.650810 sec +TOTAL : 1.654681 sec INFO: No Floating Point Exceptions have been reported - 3,050,285,995 cycles # 1.842 GHz - 5,834,789,164 instructions # 1.91 insn per cycle - 1.656446986 seconds time elapsed + 3,071,792,415 cycles # 1.854 
GHz + 5,835,969,355 instructions # 1.90 insn per cycle + 1.660064901 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 3e123e6fd7..8b44c0445b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:36:11 +DATE: 2024-06-02_20:52:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096553e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.763289e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.037690e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.016058e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.669695e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.038322e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480283 sec +TOTAL : 0.483252 sec INFO: No Floating Point Exceptions have been reported - 2,036,711,218 cycles # 2.852 GHz - 2,918,453,967 
instructions # 1.43 insn per cycle - 0.771336406 seconds time elapsed + 1,998,362,251 cycles # 2.820 GHz + 2,889,875,535 instructions # 1.45 insn per cycle + 0.765688803 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.166079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.236793e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.236793e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.138992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.207513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207513e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.926463 sec +TOTAL : 4.990603 sec INFO: No Floating Point Exceptions have been reported - 14,320,299,267 cycles # 2.905 GHz - 39,836,243,439 instructions # 2.78 insn per cycle - 4.931482509 seconds time elapsed + 14,312,095,417 cycles # 2.865 GHz + 39,833,075,351 instructions # 2.78 insn per cycle + 4.996093045 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 
5.723514e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.285593e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.285593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.631923e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.176181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.176181e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.919156 sec +TOTAL : 1.951672 sec INFO: No Floating Point Exceptions have been reported - 5,582,245,803 cycles # 2.902 GHz - 15,285,424,302 instructions # 2.74 insn per cycle - 1.924109376 seconds time elapsed + 5,596,939,158 cycles # 2.861 GHz + 15,284,742,297 instructions # 2.73 insn per cycle + 1.956996618 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.349024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.991002e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.991002e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.218680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.834401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.834401e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.737963 sec +TOTAL : 1.775280 sec INFO: No Floating Point Exceptions have been reported - 4,749,494,972 cycles # 2.726 GHz - 9,735,095,064 instructions # 2.05 insn per cycle - 1.742978161 seconds time elapsed + 4,757,273,897 cycles # 2.673 GHz + 9,734,524,466 instructions # 2.05 insn per cycle + 1.780718117 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3708) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.536931e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.219273e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.219273e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.294887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.927248e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.927248e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.690263 sec +TOTAL : 1.754842 sec INFO: No Floating Point Exceptions have been reported - 4,623,322,631 cycles # 2.728 GHz - 9,325,575,279 instructions # 2.02 insn per cycle - 1.695318457 seconds time elapsed + 4,627,741,209 cycles # 2.630 GHz + 9,326,813,558 instructions # 2.02 insn per cycle + 1.760340221 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.572579e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.052133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.052133e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.443462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.905052e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.905052e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.968034 sec +TOTAL : 2.015644 sec INFO: No Floating Point Exceptions have been reported - 3,660,831,684 cycles # 1.856 GHz - 7,034,974,988 instructions # 1.92 insn per cycle - 1.973212700 seconds time elapsed + 3,664,457,552 cycles # 1.814 GHz + 7,034,592,113 instructions # 1.92 insn per cycle + 2.021014520 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2610) (512y: 12) (512z: 2220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index c7eded0fc2..0b4aad6d48 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:52:42 +DATE: 2024-06-02_21:19:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.456356e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.657836e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.983561e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.536351e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649763e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.975434e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.487671 sec +TOTAL : 0.490081 sec INFO: No Floating Point Exceptions have been reported - 2,030,099,363 cycles # 2.844 GHz - 2,856,891,631 instructions # 1.41 insn per cycle - 0.771313393 seconds time elapsed + 2,012,235,376 cycles # 2.813 GHz + 2,887,278,665 instructions # 1.43 insn per cycle + 0.773980012 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.397227e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481743e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.481743e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.473438e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.473438e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.460576 sec +TOTAL : 4.481561 sec INFO: No Floating Point Exceptions have been reported - 12,588,647,411 cycles # 2.819 GHz - 34,372,288,545 instructions # 2.73 insn per cycle - 4.465853868 seconds time elapsed + 12,595,769,062 cycles # 2.808 GHz + 34,371,859,733 instructions # 2.73 insn per cycle + 4.486981898 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.225217e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.687950e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.687950e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.156182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.609115e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.609115e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.095644 sec +TOTAL : 2.124099 sec INFO: No Floating Point Exceptions have been reported - 6,085,238,066 cycles # 2.897 GHz - 14,860,574,019 instructions # 2.44 insn per cycle - 2.101017455 seconds time elapsed + 6,097,938,896 cycles # 2.864 GHz + 14,860,412,482 instructions # 2.44 insn per cycle + 2.129667458 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.969640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.750011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.750011e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.956935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.736211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.736211e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.592133 sec +TOTAL : 1.596219 sec INFO: No Floating Point Exceptions have been reported - 4,316,607,801 cycles # 2.703 GHz - 9,028,975,402 instructions # 2.09 insn per cycle - 1.597664902 seconds time elapsed + 4,283,389,233 cycles # 2.675 GHz + 9,028,537,855 instructions # 2.11 insn per cycle + 1.601731069 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.187100e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.023996e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.023996e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.081544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.893467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
7.893467e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.545422 sec +TOTAL : 1.569456 sec INFO: No Floating Point Exceptions have been reported - 4,204,195,380 cycles # 2.712 GHz - 8,663,569,400 instructions # 2.06 insn per cycle - 1.550927334 seconds time elapsed + 4,194,103,331 cycles # 2.666 GHz + 8,663,712,018 instructions # 2.07 insn per cycle + 1.575061670 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.251438e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.680453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.680453e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.158310e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.571602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.571602e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.083936 sec +TOTAL : 2.121701 sec INFO: No Floating Point Exceptions have been reported - 3,833,998,104 cycles # 1.836 GHz - 7,808,361,622 instructions # 2.04 insn per cycle - 2.089489123 seconds time elapsed + 3,840,319,125 cycles # 1.806 GHz + 7,808,340,953 instructions # 2.03 insn per cycle + 2.127208383 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4424) (512y: 0) (512z: 2555) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index aad34f68a4..68145ed810 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:53:02 +DATE: 2024-06-02_21:19:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.520611e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.721194e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.056652e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.581017e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.707013e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.045464e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.485221 sec +TOTAL : 0.486421 sec INFO: No Floating Point Exceptions have been reported - 2,023,639,378 cycles # 2.841 GHz - 2,891,046,466 instructions # 1.43 insn per cycle - 0.769493206 seconds time elapsed + 2,005,170,721 cycles # 2.817 GHz + 2,885,331,094 instructions # 1.44 insn per cycle + 0.769664626 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": 
launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.614708e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.719370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.719370e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.592369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.693697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.693697e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.097789 sec +TOTAL : 4.133554 sec INFO: No Floating Point Exceptions have been reported - 11,755,034,517 cycles # 2.866 GHz - 35,108,588,793 instructions # 2.99 insn per cycle - 4.103114971 seconds time elapsed + 11,751,435,134 cycles # 2.840 GHz + 35,107,900,053 instructions # 2.99 insn per cycle + 4.139079374 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.332294e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.809853e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.809853e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.304158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.779755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.779755e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 
2.053683 sec +TOTAL : 2.066374 sec INFO: No Floating Point Exceptions have been reported - 5,951,415,517 cycles # 2.891 GHz - 14,470,123,335 instructions # 2.43 insn per cycle - 2.059025817 seconds time elapsed + 5,955,734,514 cycles # 2.876 GHz + 14,470,820,860 instructions # 2.43 insn per cycle + 2.071726681 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.326940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.191185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.191185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.262038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.115917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.115917e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.518155 sec +TOTAL : 1.531929 sec INFO: No Floating Point Exceptions have been reported - 4,152,217,913 cycles # 2.727 GHz - 8,874,854,960 instructions # 2.14 insn per cycle - 1.523530355 seconds time elapsed + 4,141,893,808 cycles # 2.695 GHz + 8,874,492,613 instructions # 2.14 insn per cycle + 1.537451545 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal 
loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.326335e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.192412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.192412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.243724e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.092067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.092067e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.518142 sec +TOTAL : 1.535382 sec INFO: No Floating Point Exceptions have been reported - 4,138,145,120 cycles # 2.717 GHz - 8,411,511,000 instructions # 2.03 insn per cycle - 1.523559219 seconds time elapsed + 4,153,568,465 cycles # 2.697 GHz + 8,412,828,463 instructions # 2.03 insn per cycle + 1.540818202 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.337364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.777859e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.777859e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.345549e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.788380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.788380e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.053123 sec +TOTAL : 2.050082 sec INFO: No Floating Point Exceptions have been reported - 3,784,038,038 cycles # 1.840 GHz - 7,702,433,783 instructions # 2.04 insn per cycle - 2.058532499 seconds time elapsed + 3,776,688,425 cycles # 1.838 GHz + 
7,700,644,489 instructions # 2.04 insn per cycle + 2.055466903 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3440) (512y: 0) (512z: 2107) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index ff88d5da2d..ac74dccede 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:36:32 +DATE: 2024-06-02_20:53:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.198792e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180605e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275668e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.554923e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277525e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.521467 sec +TOTAL : 0.530564 sec INFO: No Floating Point Exceptions have been reported - 2,143,649,339 cycles # 2.843 GHz - 3,098,162,725 
instructions # 1.45 insn per cycle - 0.810608393 seconds time elapsed + 2,183,555,188 cycles # 2.822 GHz + 3,144,815,139 instructions # 1.44 insn per cycle + 0.830734274 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.033714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.092456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.092456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.007712e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.065929e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.065929e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.262850 sec +TOTAL : 5.334891 sec INFO: No Floating Point Exceptions have been reported - 15,278,986,093 cycles # 2.901 GHz - 38,575,389,182 instructions # 2.52 insn per cycle - 5.268064562 seconds time elapsed + 15,289,472,900 cycles # 2.864 GHz + 38,577,953,274 instructions # 2.52 insn per cycle + 5.340497151 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 
3.527314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.723139e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.723139e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464249e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.655908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655908e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.080390 sec +TOTAL : 3.137831 sec INFO: No Floating Point Exceptions have been reported - 8,961,614,258 cycles # 2.906 GHz - 24,226,315,758 instructions # 2.70 insn per cycle - 3.085434765 seconds time elapsed + 8,973,826,641 cycles # 2.856 GHz + 24,223,107,065 instructions # 2.70 insn per cycle + 3.143393706 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.613394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.100134e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.100134e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.480038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.956414e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.956414e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.976346 sec +TOTAL : 2.024686 sec INFO: No Floating Point Exceptions have been reported - 5,394,338,439 cycles # 2.724 GHz - 11,277,527,499 instructions # 2.09 insn per cycle - 1.981499886 seconds time elapsed + 5,399,489,629 cycles # 2.661 GHz + 11,276,345,804 instructions # 2.09 insn per cycle + 2.030286202 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2480) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.276948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.897611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.087786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.678670e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.678670e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.778784 sec +TOTAL : 1.832786 sec INFO: No Floating Point Exceptions have been reported - 4,855,499,941 cycles # 2.723 GHz - 10,526,571,188 instructions # 2.17 insn per cycle - 1.784170390 seconds time elapsed + 4,867,249,834 cycles # 2.649 GHz + 10,525,904,310 instructions # 2.16 insn per cycle + 1.838359941 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.815864e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.036087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.036087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.703127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.911454e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.911454e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.856419 sec +TOTAL : 2.942466 sec INFO: No Floating Point Exceptions have been reported - 5,199,981,370 cycles # 1.818 GHz - 7,603,665,117 instructions # 1.46 insn per cycle - 2.861804972 seconds time elapsed + 5,244,087,950 cycles # 1.780 GHz + 7,604,896,768 instructions # 1.45 insn per cycle + 2.947952496 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 1d76304278..e93587dbb1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_14:36:55 +DATE: 2024-06-02_20:53:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.208651e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184994e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.280716e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.488756e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158798e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.278167e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.522551 sec +TOTAL : 0.532803 sec INFO: No Floating Point Exceptions have been reported - 2,145,230,616 cycles # 2.840 GHz - 3,093,123,772 instructions # 1.44 insn per cycle - 0.812278354 seconds time elapsed + 2,183,879,430 cycles # 2.808 GHz + 3,134,949,776 instructions # 1.44 insn per cycle + 0.834793767 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.021911e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079930e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.079930e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.997980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.055486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.055486e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.294031 sec +TOTAL : 5.359665 sec INFO: No Floating Point Exceptions have been reported - 15,341,153,400 cycles # 2.896 GHz - 40,370,282,827 instructions # 2.63 insn per cycle - 5.299425936 seconds time elapsed + 15,360,836,715 cycles # 2.864 GHz + 40,374,148,262 instructions # 2.63 insn per cycle + 5.365170950 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.710012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926494e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.926494e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.653686e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.867948e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.867948e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.935201 sec +TOTAL : 2.980978 sec INFO: No Floating Point Exceptions have been reported - 8,515,314,447 cycles # 2.897 GHz - 23,253,613,819 instructions # 2.73 insn per cycle - 2.940392108 seconds time elapsed + 8,537,424,840 cycles # 2.860 GHz + 23,255,933,901 instructions # 2.72 insn per cycle + 2.986536514 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.780066e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.132607e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.132607e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.675454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.025638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.025638e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.302191 sec +TOTAL : 2.354095 sec INFO: No Floating Point Exceptions have been reported - 6,262,262,467 cycles # 2.715 GHz - 12,962,490,062 instructions # 2.07 insn per cycle - 2.307689771 seconds time elapsed + 6,271,057,831 cycles # 2.659 GHz + 12,961,948,705 instructions # 2.07 insn per cycle + 2.359820621 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.109643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.511847e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.511847e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.936868e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.317677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 5.317677e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.160493 sec +TOTAL : 2.234617 sec INFO: No Floating Point Exceptions have been reported - 5,903,466,716 cycles # 2.727 GHz - 12,238,680,442 instructions # 2.07 insn per cycle - 2.165768560 seconds time elapsed + 5,930,155,286 cycles # 2.648 GHz + 12,239,916,481 instructions # 2.06 insn per cycle + 2.240091992 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.507940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.694154e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.694154e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.444871e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625260e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.097205 sec +TOTAL : 3.154913 sec INFO: No Floating Point Exceptions have been reported - 5,614,268,818 cycles # 1.810 GHz - 8,744,074,840 instructions # 1.56 insn per cycle - 3.102417520 seconds time elapsed + 5,603,434,208 cycles # 1.774 GHz + 8,746,306,669 instructions # 1.56 insn per cycle + 3.160521781 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 1d7490861d..d2a9436bac 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_14:37:19 +DATE: 2024-06-02_20:53:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.992211e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047041e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061161e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.828792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058201e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.072605e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469322 sec +TOTAL : 0.469991 sec INFO: No Floating Point Exceptions have been reported - 1,970,950,644 cycles # 2.853 GHz - 2,836,233,202 instructions # 1.44 insn per cycle - 0.747868437 seconds time elapsed + 1,950,502,519 cycles # 2.813 GHz + 2,806,448,865 instructions # 1.44 insn per cycle + 0.749484229 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": 
launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.129686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.329949e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341716e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.080589e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.327376e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341190e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.607217 sec +TOTAL : 0.609725 sec INFO: No Floating Point Exceptions have been reported - 2,397,125,482 cycles # 2.825 GHz - 3,658,262,516 instructions # 1.53 insn per cycle - 0.909559944 seconds time elapsed + 2,396,738,847 cycles # 2.820 GHz + 3,668,256,125 instructions # 1.53 insn per cycle + 0.908964727 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379379e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.391311e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.391311e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.392275e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.404239e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.404239e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.910347 sec +TOTAL : 6.873703 sec INFO: No Floating Point Exceptions have been reported - 19,789,020,586 cycles # 2.863 GHz - 59,609,829,111 instructions # 3.01 insn per cycle - 6.914699001 seconds time elapsed + 19,791,549,123 cycles # 2.878 GHz + 59,606,317,603 instructions # 3.01 insn per cycle + 6.878041961 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.619966e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.665049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.665049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.568257e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.612303e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.612303e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.569460 sec +TOTAL : 3.610171 sec INFO: No Floating Point Exceptions have been reported - 10,374,266,250 cycles # 
2.904 GHz - 30,674,256,165 instructions # 2.96 insn per cycle - 3.573646642 seconds time elapsed + 10,370,530,942 cycles # 2.870 GHz + 30,676,186,235 instructions # 2.96 insn per cycle + 3.614641811 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.120184e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.293257e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.293257e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.953227e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.119594e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.119594e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.820051 sec +TOTAL : 1.853859 sec INFO: No Floating Point Exceptions have been reported - 4,901,380,147 cycles # 2.688 GHz - 11,019,047,598 instructions # 2.25 insn per cycle - 1.824311195 seconds time elapsed + 4,895,759,086 cycles # 2.636 GHz + 11,018,740,137 instructions # 2.25 insn per cycle + 1.858168783 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049956e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.002285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.022719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022719e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.616748 sec +TOTAL : 1.658467 sec INFO: No Floating Point Exceptions have been reported - 4,378,615,331 cycles # 2.702 GHz - 10,296,117,856 instructions # 2.35 insn per cycle - 1.621129053 seconds time elapsed + 4,377,789,034 cycles # 2.634 GHz + 10,296,146,857 instructions # 2.35 insn per cycle + 1.662826466 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.954224e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.056280e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.056280e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.862327e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.962239e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.962239e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.381742 sec +TOTAL : 2.413722 sec INFO: No Floating Point Exceptions have been reported - 4,108,596,097 cycles # 1.723 GHz - 5,842,404,115 instructions # 1.42 insn per cycle - 2.385936782 seconds time elapsed + 4,103,494,859 cycles # 1.698 GHz + 5,842,470,718 instructions # 1.42 insn per cycle + 2.418058321 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 45a1ef164b..a85c881c90 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_15:01:40 +DATE: 2024-06-02_21:28:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.535443e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.780857e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.780857e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.555829e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.818637e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.818637e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.503670 sec +TOTAL : 0.500626 sec INFO: No Floating Point Exceptions have been reported - 2,012,376,201 cycles # 2.812 GHz - 3,006,218,540 instructions # 1.49 insn per cycle - 0.774572160 seconds time elapsed + 2,009,885,240 cycles # 2.812 GHz + 3,066,226,195 instructions # 1.53 insn per cycle + 0.771725422 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.606024e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.624765e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.624765e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.701616e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.932321e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.932321e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.841754 sec +TOTAL : 0.829861 sec INFO: No Floating Point Exceptions have been reported - 3,099,668,806 cycles # 2.832 GHz - 4,993,276,525 instructions # 1.61 insn per cycle - 1.155254157 seconds time elapsed + 3,061,531,665 cycles # 2.832 GHz + 4,949,277,454 instructions # 1.62 insn per cycle + 1.138627743 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.380068e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.392068e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.392068e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385150e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.397380e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.397380e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.915910 sec +TOTAL : 6.902584 sec INFO: No Floating Point Exceptions have been reported - 19,806,579,322 cycles # 2.863 GHz - 59,611,012,266 instructions # 3.01 insn per cycle - 6.920308116 seconds time elapsed + 19,794,872,013 cycles # 2.866 GHz + 59,611,558,170 instructions # 3.01 insn per cycle + 6.907141027 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.550339e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.594733e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.594733e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.557869e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.602026e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.602026e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.631431 sec +TOTAL : 3.626237 sec INFO: No Floating Point Exceptions have been reported - 10,404,134,292 cycles # 
2.862 GHz - 30,722,305,980 instructions # 2.95 insn per cycle - 3.635916319 seconds time elapsed + 10,407,659,030 cycles # 2.867 GHz + 30,722,342,234 instructions # 2.95 insn per cycle + 3.630843347 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.991824e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.166141e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.166141e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.913116e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.082779e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.082779e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.853778 sec +TOTAL : 1.870398 sec INFO: No Floating Point Exceptions have been reported - 4,943,570,309 cycles # 2.661 GHz - 11,067,752,215 instructions # 2.24 insn per cycle - 1.858370590 seconds time elapsed + 4,945,675,416 cycles # 2.639 GHz + 11,067,795,090 instructions # 2.24 insn per cycle + 1.874949750 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.005140e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.026682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.026682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.991691e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.020412e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.020412e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.662867 sec +TOTAL : 1.672210 sec INFO: No Floating Point Exceptions have been reported - 4,426,260,539 cycles # 2.656 GHz - 10,346,882,831 instructions # 2.34 insn per cycle - 1.667431238 seconds time elapsed + 4,419,204,279 cycles # 2.637 GHz + 10,345,034,833 instructions # 2.34 insn per cycle + 1.676771727 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.832038e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.932754e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.932754e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.843006e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.942637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.942637e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.431611 sec +TOTAL : 2.429398 sec INFO: No Floating Point Exceptions have been reported - 4,145,808,516 cycles # 1.702 GHz - 5,880,428,508 instructions # 1.42 insn per cycle - 2.436095886 seconds time elapsed + 4,154,030,623 cycles # 1.707 GHz + 5,882,157,165 instructions # 1.42 insn per cycle + 2.434135528 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index c8d4c1d012..9c5400dc3c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_14:37:45 +DATE: 2024-06-02_20:54:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.984938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.044546e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056865e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.728787e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040296e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054902e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.468609 sec +TOTAL : 0.469961 sec INFO: No Floating Point Exceptions have been reported - 1,981,002,182 cycles # 2.846 GHz - 2,842,945,772 instructions # 1.44 insn per cycle - 0.752497111 seconds time elapsed + 1,945,140,914 
cycles # 2.814 GHz + 2,797,132,683 instructions # 1.44 insn per cycle + 0.748195009 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.119070e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.315352e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.326681e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.071125e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.312843e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325872e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.601408 sec +TOTAL : 0.609321 sec INFO: No Floating Point Exceptions have been reported - 2,383,936,937 cycles # 2.851 GHz - 3,651,729,049 instructions # 1.53 insn per cycle - 0.896728355 seconds time elapsed + 2,388,183,446 cycles # 2.820 GHz + 3,624,721,355 instructions # 1.52 insn per cycle + 0.907535221 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.454763e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.467389e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.467389e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.416052e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.428253e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.428253e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.698212 sec +TOTAL : 6.805799 sec INFO: No Floating Point Exceptions have been reported - 19,500,935,732 cycles # 2.911 GHz - 58,799,003,967 instructions # 3.02 insn per cycle - 6.702449206 seconds time elapsed + 19,509,656,309 cycles # 2.865 GHz + 58,797,581,425 instructions # 3.01 insn per cycle + 6.810019541 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.669930e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.715854e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715854e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.624636e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.669607e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.669607e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.531511 sec +TOTAL : 3.566261 sec INFO: No Floating Point Exceptions have been reported - 10,228,095,464 cycles # 
2.894 GHz - 30,347,180,891 instructions # 2.97 insn per cycle - 3.535798492 seconds time elapsed + 10,224,672,260 cycles # 2.864 GHz + 30,345,523,778 instructions # 2.97 insn per cycle + 3.570675168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.789972e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.950829e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.950829e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.622392e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.780322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.780322e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.887432 sec +TOTAL : 1.924079 sec INFO: No Floating Point Exceptions have been reported - 5,055,118,079 cycles # 2.674 GHz - 11,484,444,983 instructions # 2.27 insn per cycle - 1.891612421 seconds time elapsed + 5,063,711,278 cycles # 2.628 GHz + 11,483,381,207 instructions # 2.27 insn per cycle + 1.928457003 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 9.667837e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.860484e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.860484e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.428126e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.609483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.609483e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.718788 sec +TOTAL : 1.762110 sec INFO: No Floating Point Exceptions have been reported - 4,655,858,880 cycles # 2.704 GHz - 10,842,096,596 instructions # 2.33 insn per cycle - 1.722993406 seconds time elapsed + 4,654,381,781 cycles # 2.637 GHz + 10,841,512,729 instructions # 2.33 insn per cycle + 1.766492012 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.981237e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.082937e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.082937e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.831561e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.933974e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.933974e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.372571 sec +TOTAL : 2.424320 sec INFO: No Floating Point Exceptions have been reported - 4,129,142,877 cycles # 1.738 GHz - 6,106,185,085 instructions # 1.48 insn per cycle - 2.376879303 seconds time elapsed + 4,122,188,832 cycles # 1.698 GHz + 6,106,386,209 instructions # 1.48 insn per cycle + 2.428663337 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e4bc7cf2cc..25f5a9a1db 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_14:38:10 +DATE: 2024-06-02_20:54:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.514552e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271085e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366020e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.457737e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273716e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366330e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.450662 sec +TOTAL : 0.454346 sec INFO: No Floating Point Exceptions have been reported - 1,888,418,045 cycles # 2.834 GHz - 2,686,004,303 instructions # 1.42 insn per cycle - 0.722549365 seconds time elapsed + 1,882,812,006 
cycles # 2.812 GHz + 2,667,678,817 instructions # 1.42 insn per cycle + 0.728316472 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.424662e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.459806e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.527254e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.267220e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.427979e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.526801e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.495261 sec +TOTAL : 0.499240 sec INFO: No Floating Point Exceptions have been reported - 2,099,817,827 cycles # 2.862 GHz - 2,990,738,948 instructions # 1.42 insn per cycle - 0.790419941 seconds time elapsed + 2,052,043,922 cycles # 2.819 GHz + 2,965,868,392 instructions # 1.45 insn per cycle + 0.784455397 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.505220e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.518346e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.518346e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.468259e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481049e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481049e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.562288 sec +TOTAL : 6.660760 sec INFO: No Floating Point Exceptions have been reported - 19,080,957,547 cycles # 2.906 GHz - 58,959,648,789 instructions # 3.09 insn per cycle - 6.566573323 seconds time elapsed + 19,087,341,831 cycles # 2.864 GHz + 58,960,382,092 instructions # 3.09 insn per cycle + 6.664849133 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.204155e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.352745e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.352745e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.119436e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.261857e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.261857e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.018056 sec +TOTAL : 2.039807 sec INFO: No Floating Point Exceptions have been reported - 5,861,245,947 cycles # 2.899 
GHz - 16,693,370,121 instructions # 2.85 insn per cycle - 2.022246601 seconds time elapsed + 5,851,713,678 cycles # 2.864 GHz + 16,693,562,801 instructions # 2.85 insn per cycle + 2.044009980 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.747206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.811751e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.811751e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.728939e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.791004e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791004e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.959718 sec +TOTAL : 0.969206 sec INFO: No Floating Point Exceptions have been reported - 2,597,973,759 cycles # 2.697 GHz - 5,979,816,432 instructions # 2.30 insn per cycle - 0.963957244 seconds time elapsed + 2,595,644,078 cycles # 2.669 GHz + 5,979,320,953 instructions # 2.30 insn per cycle + 0.973332836 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.928786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.008064e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.008064e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.907804e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.986792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.986792e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.871454 sec +TOTAL : 0.881217 sec INFO: No Floating Point Exceptions have been reported - 2,346,801,151 cycles # 2.682 GHz - 5,601,970,539 instructions # 2.39 insn per cycle - 0.875813732 seconds time elapsed + 2,345,880,719 cycles # 2.652 GHz + 5,602,748,051 instructions # 2.39 insn per cycle + 0.885574348 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.412327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.455439e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.455439e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.406889e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.449235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.449235e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.184240 sec +TOTAL : 1.188517 sec INFO: No Floating Point Exceptions have been reported - 2,059,493,323 cycles # 1.734 GHz - 3,333,364,881 instructions # 1.62 insn per cycle - 1.188531798 seconds time elapsed + 2,058,034,828 cycles # 1.727 GHz + 3,333,328,616 instructions # 1.62 insn per cycle + 1.192698457 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index d735dc5897..e87a092429 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_15:02:06 +DATE: 2024-06-02_21:28:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.750186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.085490e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.085490e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.706813e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.038920e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.038920e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.469338 sec +TOTAL : 0.466185 sec INFO: No Floating Point Exceptions have been reported - 1,918,362,944 cycles # 2.804 GHz - 2,834,169,916 instructions # 1.48 insn per cycle - 0.742178075 seconds time elapsed + 1,911,951,129 cycles # 2.814 GHz + 2,841,120,455 instructions # 1.49 insn per cycle + 0.735601781 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.524122e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.570005e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.570005e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.584141e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.645916e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.645916e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.651816 sec +TOTAL : 0.648144 sec INFO: No Floating Point Exceptions have been reported - 2,503,160,784 cycles # 2.822 GHz - 3,832,792,162 instructions # 1.53 insn per cycle - 0.943470239 seconds time elapsed + 2,489,864,035 cycles # 2.825 GHz + 3,827,644,098 instructions # 1.54 insn per cycle + 0.938216487 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.465694e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.479110e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.479110e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.480910e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.493960e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.493960e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.671662 sec +TOTAL : 6.630728 sec INFO: No Floating Point Exceptions have been reported - 19,108,337,453 cycles # 2.863 GHz - 58,967,331,894 instructions # 3.09 insn per cycle - 6.675976597 seconds time elapsed + 19,100,779,078 cycles # 2.879 GHz + 58,964,120,971 instructions # 3.09 insn per cycle + 6.635020831 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.093089e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.238027e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.238027e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.112476e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.258255e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.258255e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.051178 sec +TOTAL : 2.046410 sec INFO: No Floating Point Exceptions have been reported - 5,880,119,320 cycles # 
2.862 GHz - 16,741,679,626 instructions # 2.85 insn per cycle - 2.055508197 seconds time elapsed + 5,888,238,325 cycles # 2.872 GHz + 16,741,878,300 instructions # 2.84 insn per cycle + 2.050817243 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.718905e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.782305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.782305e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.723304e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.786360e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.786360e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.980043 sec +TOTAL : 0.977338 sec INFO: No Floating Point Exceptions have been reported - 2,616,418,693 cycles # 2.660 GHz - 6,017,096,104 instructions # 2.30 insn per cycle - 0.984343134 seconds time elapsed + 2,615,739,499 cycles # 2.667 GHz + 6,017,192,558 instructions # 2.30 insn per cycle + 0.981679944 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.912882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.991175e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.991175e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915953e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.994516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.994516e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.883189 sec +TOTAL : 0.881911 sec INFO: No Floating Point Exceptions have been reported - 2,365,822,002 cycles # 2.667 GHz - 5,638,771,692 instructions # 2.38 insn per cycle - 0.887626463 seconds time elapsed + 2,367,964,767 cycles # 2.674 GHz + 5,639,235,283 instructions # 2.38 insn per cycle + 0.886220730 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.399129e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.441231e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.441231e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.400847e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.442998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.442998e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.200076 sec +TOTAL : 1.198499 sec INFO: No Floating Point Exceptions have been reported - 2,081,452,605 cycles # 1.729 GHz - 3,374,965,036 instructions # 1.62 insn per cycle - 1.204429196 seconds time elapsed + 2,084,095,957 cycles # 1.733 GHz + 3,374,916,702 instructions # 1.62 insn per cycle + 1.202990814 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 3d41e21b12..b3b78f68de 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_14:38:31 +DATE: 2024-06-02_20:55:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.548366e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.290418e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382374e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.497603e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.303121e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.398772e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.453301 sec +TOTAL : 0.451033 sec INFO: No Floating Point Exceptions have been reported - 1,884,361,235 cycles # 2.811 GHz - 2,662,129,036 instructions # 1.41 insn per cycle - 0.727401829 seconds time elapsed + 1,878,344,567 cycles 
# 2.819 GHz + 2,673,672,832 instructions # 1.42 insn per cycle + 0.723231427 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.381856e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.386346e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.451907e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.242647e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.389213e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.475124e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.498921 sec +TOTAL : 0.498086 sec INFO: No Floating Point Exceptions have been reported - 2,065,776,106 cycles # 2.820 GHz - 3,002,526,593 instructions # 1.45 insn per cycle - 0.789720140 seconds time elapsed + 2,053,404,426 cycles # 2.826 GHz + 2,966,260,637 instructions # 1.44 insn per cycle + 0.782979940 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.479714e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.492704e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.492704e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.479080e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.492104e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.492104e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.641350 sec +TOTAL : 6.631451 sec INFO: No Floating Point Exceptions have been reported - 18,978,826,784 cycles # 2.861 GHz - 58,704,221,037 instructions # 3.09 insn per cycle - 6.645410970 seconds time elapsed + 18,984,604,647 cycles # 2.862 GHz + 58,702,110,153 instructions # 3.09 insn per cycle + 6.635623922 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.494310e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.651898e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.651898e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.506447e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.662703e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.662703e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.950028 sec +TOTAL : 1.947376 sec INFO: No Floating Point Exceptions have been reported - 5,589,974,968 cycles # 2.862 
GHz - 16,510,304,699 instructions # 2.95 insn per cycle - 1.954264273 seconds time elapsed + 5,589,202,869 cycles # 2.865 GHz + 16,510,174,954 instructions # 2.95 insn per cycle + 1.951631264 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.496639e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.543532e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.543532e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.493410e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.539841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.539841e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.116418 sec +TOTAL : 1.118677 sec INFO: No Floating Point Exceptions have been reported - 2,975,820,242 cycles # 2.657 GHz - 6,633,799,194 instructions # 2.23 insn per cycle - 1.120575232 seconds time elapsed + 2,976,233,441 cycles # 2.652 GHz + 6,633,667,708 instructions # 2.23 insn per cycle + 1.122890193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.615016e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.669374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.669374e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.625860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.680500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.680500e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.036246 sec +TOTAL : 1.029211 sec INFO: No Floating Point Exceptions have been reported - 2,759,204,529 cycles # 2.654 GHz - 6,255,102,481 instructions # 2.27 insn per cycle - 1.040401186 seconds time elapsed + 2,757,551,412 cycles # 2.670 GHz + 6,254,933,924 instructions # 2.27 insn per cycle + 1.033320955 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.286831e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.322123e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322123e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.290714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325632e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.300128 sec +TOTAL : 1.293262 sec INFO: No Floating Point Exceptions have been reported - 2,231,395,652 cycles # 1.715 GHz - 3,699,704,768 instructions # 1.66 insn per cycle - 1.304305216 seconds time elapsed + 2,228,539,636 cycles # 1.719 GHz + 3,697,845,631 instructions # 1.66 insn per cycle + 1.297458184 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2391) (512y: 29) (512z: 3970) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 18990368c8..1aea1ca46b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_14:38:52 +DATE: 2024-06-02_20:55:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.980776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047318e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.059891e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.705675e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040108e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054231e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469526 sec +TOTAL : 0.467689 sec INFO: No Floating Point Exceptions have been reported - 1,950,532,568 cycles # 2.815 GHz - 2,802,706,395 instructions # 1.44 insn per cycle - 0.749158155 seconds time elapsed + 1,951,012,614 cycles 
# 2.822 GHz + 2,780,514,605 instructions # 1.43 insn per cycle + 0.747566497 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.120585e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.317479e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329114e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.071549e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315004e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328475e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.606344 sec +TOTAL : 0.609446 sec INFO: No Floating Point Exceptions have been reported - 2,403,151,636 cycles # 2.824 GHz - 3,669,339,361 instructions # 1.53 insn per cycle - 0.910110717 seconds time elapsed + 2,398,520,319 cycles # 2.827 GHz + 3,697,559,551 instructions # 1.54 insn per cycle + 0.906881942 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.348054e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359694e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359694e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.346860e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.358448e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.358448e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.005029 sec +TOTAL : 7.005881 sec INFO: No Floating Point Exceptions have been reported - 20,055,951,018 cycles # 2.863 GHz - 60,536,467,053 instructions # 3.02 insn per cycle - 7.009312607 seconds time elapsed + 20,061,545,492 cycles # 2.863 GHz + 60,534,513,586 instructions # 3.02 insn per cycle + 7.010263470 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.638770e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.684822e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.684822e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.628417e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.673382e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.673382e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.556206 sec +TOTAL : 3.563203 sec INFO: No Floating Point Exceptions have been reported - 10,186,602,629 cycles # 2.862 
GHz - 30,386,009,701 instructions # 2.98 insn per cycle - 3.560429335 seconds time elapsed + 10,193,843,427 cycles # 2.858 GHz + 30,384,715,959 instructions # 2.98 insn per cycle + 3.567486704 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.050822e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.223334e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.223334e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.060307e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.230512e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.230512e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.833811 sec +TOTAL : 1.832180 sec INFO: No Floating Point Exceptions have been reported - 4,877,548,863 cycles # 2.655 GHz - 10,978,535,397 instructions # 2.25 insn per cycle - 1.838126466 seconds time elapsed + 4,873,743,401 cycles # 2.655 GHz + 10,979,146,931 instructions # 2.25 insn per cycle + 1.836546702 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.034701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.056812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056812e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.032510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054207e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054207e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.607005 sec +TOTAL : 1.610269 sec INFO: No Floating Point Exceptions have been reported - 4,285,859,041 cycles # 2.661 GHz - 10,248,085,853 instructions # 2.39 insn per cycle - 1.611327735 seconds time elapsed + 4,286,427,813 cycles # 2.656 GHz + 10,247,731,306 instructions # 2.39 insn per cycle + 1.614556045 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.675038e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.769490e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.769490e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.692625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.784575e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.784575e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.480681 sec +TOTAL : 2.474016 sec INFO: No Floating Point Exceptions have been reported - 4,211,204,679 cycles # 1.695 GHz - 6,044,041,090 instructions # 1.44 insn per cycle - 2.485018889 seconds time elapsed + 4,210,263,291 cycles # 1.700 GHz + 6,043,220,655 instructions # 1.44 insn per cycle + 2.478297594 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index bea6b18082..1c6d0ff5f8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-16_14:39:18 +DATE: 2024-06-02_20:55:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.940348e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041869e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054764e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.735452e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041244e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055299e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469154 sec +TOTAL : 0.469831 sec INFO: No Floating Point Exceptions have been reported - 1,946,414,728 cycles # 2.818 GHz - 2,803,423,086 instructions # 1.44 insn per cycle - 0.748059256 seconds time elapsed + 1,948,866,708 
cycles # 2.819 GHz + 2,802,491,668 instructions # 1.44 insn per cycle + 0.748686662 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.116866e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312173e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323463e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070810e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.308874e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321967e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.604157 sec +TOTAL : 0.605052 sec INFO: No Floating Point Exceptions have been reported - 2,374,249,289 cycles # 2.818 GHz - 3,602,148,119 instructions # 1.52 insn per cycle - 0.902621411 seconds time elapsed + 2,392,252,084 cycles # 2.830 GHz + 3,645,625,496 instructions # 1.52 insn per cycle + 0.903478683 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.368504e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.380280e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.380280e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.367403e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.379210e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379210e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.941323 sec +TOTAL : 6.944976 sec INFO: No Floating Point Exceptions have been reported - 19,878,296,626 cycles # 2.863 GHz - 59,936,362,271 instructions # 3.02 insn per cycle - 6.945573140 seconds time elapsed + 19,868,797,568 cycles # 2.860 GHz + 59,935,823,047 instructions # 3.02 insn per cycle + 6.949220462 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.689994e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.736297e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736297e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.689813e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.736104e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.736104e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.516340 sec +TOTAL : 3.516734 sec INFO: No Floating Point Exceptions have been reported - 10,077,314,757 cycles # 2.863 
GHz - 30,098,117,657 instructions # 2.99 insn per cycle - 3.520635536 seconds time elapsed + 10,083,295,126 cycles # 2.864 GHz + 30,097,719,684 instructions # 2.98 insn per cycle + 3.521023820 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.778247e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.940877e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.940877e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.780572e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.943341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.943341e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.889938 sec +TOTAL : 1.889615 sec INFO: No Floating Point Exceptions have been reported - 5,023,754,472 cycles # 2.654 GHz - 11,483,522,538 instructions # 2.29 insn per cycle - 1.894205310 seconds time elapsed + 5,024,798,861 cycles # 2.654 GHz + 11,482,219,428 instructions # 2.29 insn per cycle + 1.893950854 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 9.644687e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.842226e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.842226e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.644667e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.830260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.830260e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.722080 sec +TOTAL : 1.722143 sec INFO: No Floating Point Exceptions have been reported - 4,590,091,342 cycles # 2.660 GHz - 10,809,457,257 instructions # 2.35 insn per cycle - 1.726406566 seconds time elapsed + 4,588,199,336 cycles # 2.659 GHz + 10,809,611,838 instructions # 2.36 insn per cycle + 1.726402099 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.641517e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.735645e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.735645e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.668589e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.758731e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.758731e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.492729 sec +TOTAL : 2.482767 sec INFO: No Floating Point Exceptions have been reported - 4,229,101,372 cycles # 1.695 GHz - 6,273,394,761 instructions # 1.48 insn per cycle - 2.496999493 seconds time elapsed + 4,227,913,144 cycles # 1.701 GHz + 6,273,317,964 instructions # 1.48 insn per cycle + 2.486971348 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index adf6424639..06aa0981a7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:39:44 +DATE: 2024-06-02_20:56:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.453895e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.477096e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.479397e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.484170e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.510124e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512593e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529744 sec +TOTAL : 0.529714 sec INFO: No Floating Point Exceptions have been reported - 2,179,317,048 cycles # 2.822 GHz - 3,403,036,461 instructions # 1.56 insn per cycle - 0.830470867 seconds time elapsed + 
2,187,150,016 cycles # 2.828 GHz + 3,407,204,108 instructions # 1.56 insn per cycle + 0.831981040 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.124157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.151338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.152519e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.126702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.160633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162012e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.042150 sec +TOTAL : 3.045826 sec INFO: No Floating Point Exceptions have been reported - 9,405,604,432 cycles # 2.853 GHz - 20,118,562,201 instructions # 2.14 insn per cycle - 3.353608047 seconds time elapsed + 9,422,847,840 cycles # 2.851 GHz + 20,052,737,736 instructions # 2.13 insn per cycle + 3.360343374 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.820592e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.821434e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.821434e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.837101e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.837996e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837996e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.018372 sec +TOTAL : 8.937233 sec INFO: No Floating Point Exceptions have been reported - 25,614,013,948 cycles # 2.839 GHz - 78,938,013,495 instructions # 3.08 insn per cycle - 9.022664733 seconds time elapsed + 25,623,864,923 cycles # 2.867 GHz + 78,942,890,669 instructions # 3.08 insn per cycle + 8.941589016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.519494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.522699e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.522699e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.527505e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.530758e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.530758e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.669138 sec +TOTAL : 4.659217 sec INFO: No Floating Point Exceptions have been reported - 12,898,966,245 cycles # 
2.761 GHz - 39,280,150,365 instructions # 3.05 insn per cycle - 4.673492352 seconds time elapsed + 12,887,852,449 cycles # 2.765 GHz + 39,283,888,678 instructions # 3.05 insn per cycle + 4.663553220 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.859599e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.875346e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.875346e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.819266e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.834708e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.834708e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.097013 sec +TOTAL : 2.106980 sec INFO: No Floating Point Exceptions have been reported - 5,574,685,577 cycles # 2.655 GHz - 13,685,856,406 instructions # 2.46 insn per cycle - 2.101249976 seconds time elapsed + 5,581,493,843 cycles # 2.645 GHz + 13,685,869,165 instructions # 2.45 insn per cycle + 2.111397973 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.915800e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.935807e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.935807e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.940031e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.960919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.960919e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.848754 sec +TOTAL : 1.844023 sec INFO: No Floating Point Exceptions have been reported - 4,887,101,603 cycles # 2.639 GHz - 12,341,123,817 instructions # 2.53 insn per cycle - 1.853060894 seconds time elapsed + 4,890,407,622 cycles # 2.647 GHz + 12,340,850,912 instructions # 2.52 insn per cycle + 1.848367657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.728417e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.739729e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.739729e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.735647e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.747350e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.747350e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.446881 sec +TOTAL : 2.444477 sec INFO: No Floating Point Exceptions have been reported - 4,107,098,137 cycles # 1.676 GHz - 6,336,202,498 instructions # 1.54 insn per cycle - 2.451096147 seconds time elapsed + 4,109,625,734 cycles # 1.679 GHz + 6,334,694,015 instructions # 1.54 insn per cycle + 2.448820329 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 92636e2555..638dc04e22 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:02:53 +DATE: 2024-06-02_21:29:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.094987e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.434034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.434034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.091974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.434049e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.434049e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523594 sec +TOTAL : 0.520202 sec INFO: No Floating Point Exceptions have been reported - 2,118,517,608 cycles # 2.813 GHz - 3,348,276,596 instructions # 1.58 insn per cycle - 0.813391390 seconds time elapsed + 2,115,997,044 cycles # 2.821 GHz + 3,356,177,989 instructions # 1.59 insn per cycle + 0.810142197 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.622834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.121853e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.121853e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.632273e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.129586e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.129586e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.329713 sec +TOTAL : 3.322225 sec INFO: No Floating Point Exceptions have been reported - 10,291,111,145 cycles # 2.854 GHz - 21,714,903,322 instructions # 2.11 insn per cycle - 3.660758937 seconds time elapsed + 10,271,397,573 cycles # 2.856 GHz + 22,004,537,141 instructions # 2.14 insn per cycle + 3.652772092 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.836126e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837051e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837051e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.833131e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.834044e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.834044e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.946025 sec +TOTAL : 8.960944 sec INFO: No Floating Point Exceptions have been reported - 25,625,027,072 cycles # 2.863 GHz - 78,943,584,564 instructions # 3.08 insn per cycle - 8.950491990 seconds time elapsed + 25,662,418,157 cycles # 2.863 GHz + 78,944,265,965 instructions # 3.08 insn per cycle + 8.965544443 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.512313e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515690e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515690e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.536360e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539829e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539829e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.682891 sec +TOTAL : 4.652216 sec INFO: No Floating Point Exceptions have been reported - 12,903,818,271 cycles # 
2.754 GHz - 39,293,324,950 instructions # 3.05 insn per cycle - 4.687529036 seconds time elapsed + 12,900,905,409 cycles # 2.771 GHz + 39,296,118,040 instructions # 3.05 insn per cycle + 4.656875062 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.867831e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.884189e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.884189e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.851670e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.867737e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.867737e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.098804 sec +TOTAL : 2.103024 sec INFO: No Floating Point Exceptions have been reported - 5,587,651,201 cycles # 2.658 GHz - 13,696,262,775 instructions # 2.45 insn per cycle - 2.103410758 seconds time elapsed + 5,594,201,816 cycles # 2.655 GHz + 13,697,712,232 instructions # 2.45 insn per cycle + 2.107624615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.952196e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.973818e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.973818e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.921358e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.943578e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.943578e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.845912 sec +TOTAL : 1.852398 sec INFO: No Floating Point Exceptions have been reported - 4,903,860,646 cycles # 2.651 GHz - 12,352,108,328 instructions # 2.52 insn per cycle - 1.850421022 seconds time elapsed + 4,909,478,389 cycles # 2.645 GHz + 12,351,405,876 instructions # 2.52 insn per cycle + 1.857024744 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.711524e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.723541e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.723541e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.741453e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.753934e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.753934e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.457485 sec +TOTAL : 2.447091 sec INFO: No Floating Point Exceptions have been reported - 4,130,677,154 cycles # 1.678 GHz - 6,346,127,118 instructions # 1.54 insn per cycle - 2.462055019 seconds time elapsed + 4,126,055,402 cycles # 1.684 GHz + 6,345,698,997 instructions # 1.54 insn per cycle + 2.451723990 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 07bc3b6c73..79d60d2a9e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:12:42 +DATE: 2024-06-02_21:39:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.490501e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518177e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520849e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.458071e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.485561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487880e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.515008 sec +TOTAL : 0.516495 sec INFO: No Floating Point Exceptions have been reported - 2,117,861,647 cycles # 2.847 GHz - 3,355,581,223 instructions # 1.58 insn per cycle - 0.805282012 
seconds time elapsed + 2,100,340,953 cycles # 2.817 GHz + 3,318,240,864 instructions # 1.58 insn per cycle + 0.807243807 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.120060e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.152876e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.154244e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.151628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181415e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182666e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.146209 sec +TOTAL : 3.126580 sec INFO: No Floating Point Exceptions have been reported - 9,794,350,225 cycles # 2.878 GHz - 20,567,996,876 instructions # 2.10 insn per cycle - 3.458179285 seconds time elapsed + 9,651,663,769 cycles # 2.855 GHz + 21,411,369,286 instructions # 2.22 insn per cycle + 3.436400390 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.854249e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.855163e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.855163e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.838040e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.838971e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.838971e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.856220 sec +TOTAL : 8.934375 sec INFO: No Floating Point Exceptions have been reported - 25,606,958,110 cycles # 2.890 GHz - 78,936,876,492 instructions # 3.08 insn per cycle - 8.860490718 seconds time elapsed + 25,608,373,833 cycles # 2.865 GHz + 78,937,451,677 instructions # 3.08 insn per cycle + 8.938566207 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.547585e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.550823e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.550823e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.525117e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.528335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528335e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.633866 sec +TOTAL : 4.663427 sec INFO: No Floating Point Exceptions have been reported - 12,886,616,952 cycles # 
2.779 GHz - 39,279,548,039 instructions # 3.05 insn per cycle - 4.638052623 seconds time elapsed + 12,892,979,270 cycles # 2.763 GHz + 39,279,722,056 instructions # 3.05 insn per cycle + 4.667648484 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.950793e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.966539e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.966539e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.833327e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.849001e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.849001e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.073967 sec +TOTAL : 2.104949 sec INFO: No Floating Point Exceptions have been reported - 5,577,712,569 cycles # 2.685 GHz - 13,684,498,611 instructions # 2.45 insn per cycle - 2.078154877 seconds time elapsed + 5,585,628,818 cycles # 2.649 GHz + 13,686,707,794 instructions # 2.45 insn per cycle + 2.109194961 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.068596e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.089664e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.089664e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.959882e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.980659e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.980659e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.820316 sec +TOTAL : 1.842075 sec INFO: No Floating Point Exceptions have been reported - 4,894,997,970 cycles # 2.684 GHz - 12,339,079,686 instructions # 2.52 insn per cycle - 1.824557454 seconds time elapsed + 4,892,059,435 cycles # 2.651 GHz + 12,339,041,510 instructions # 2.52 insn per cycle + 1.846337430 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.817590e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.829323e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.829323e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.749292e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.761080e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.761080e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.417091 sec +TOTAL : 2.441489 sec INFO: No Floating Point Exceptions have been reported - 4,131,104,953 cycles # 1.707 GHz - 6,332,486,091 instructions # 1.53 insn per cycle - 2.421265188 seconds time elapsed + 4,113,508,290 cycles # 1.683 GHz + 6,332,907,864 instructions # 1.54 insn per cycle + 2.445760630 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 0a65f9fefe..5745d06e17 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:09:55 +DATE: 2024-06-02_21:36:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.458490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485387e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487802e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.461663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.488298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.490692e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.511372 sec +TOTAL : 0.512096 sec INFO: No Floating Point Exceptions have been reported - 2,125,154,213 cycles # 2.818 GHz - 3,305,948,128 instructions # 1.56 insn per cycle - 0.811831996 
seconds time elapsed + 2,118,130,475 cycles # 2.818 GHz + 3,288,416,689 instructions # 1.55 insn per cycle + 0.809966158 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.112497e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.145168e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.146541e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.134536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.165296e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.091906 sec +TOTAL : 3.077228 sec INFO: No Floating Point Exceptions have been reported - 9,555,297,501 cycles # 2.852 GHz - 20,467,928,496 instructions # 2.14 insn per cycle - 3.408325542 seconds time elapsed + 9,556,153,983 cycles # 2.858 GHz + 21,726,674,576 instructions # 2.27 insn per cycle + 3.399985544 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.835837e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.836698e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.836698e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.838183e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.839050e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.839050e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.943298 sec +TOTAL : 8.931733 sec INFO: No Floating Point Exceptions have been reported - 25,616,203,937 cycles # 2.864 GHz - 78,941,981,933 instructions # 3.08 insn per cycle - 8.947377666 seconds time elapsed + 25,607,680,070 cycles # 2.866 GHz + 78,937,604,302 instructions # 3.08 insn per cycle + 8.935953129 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.509219e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512397e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512397e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.531271e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.534495e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534495e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.682859 sec +TOTAL : 4.653724 sec INFO: No Floating Point Exceptions have been reported - 12,889,261,061 cycles # 
2.751 GHz - 39,280,374,746 instructions # 3.05 insn per cycle - 4.687212544 seconds time elapsed + 12,891,706,587 cycles # 2.769 GHz + 39,279,955,585 instructions # 3.05 insn per cycle + 4.658078583 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.873155e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.889044e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.889044e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.750526e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.765709e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.765709e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.092539 sec +TOTAL : 2.125753 sec INFO: No Floating Point Exceptions have been reported - 5,573,290,015 cycles # 2.659 GHz - 13,685,575,452 instructions # 2.46 insn per cycle - 2.096738730 seconds time elapsed + 5,636,716,298 cycles # 2.647 GHz + 13,685,667,157 instructions # 2.43 insn per cycle + 2.130052622 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.869803e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.890096e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.890096e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.975427e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.996737e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.996737e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.858817 sec +TOTAL : 1.836722 sec INFO: No Floating Point Exceptions have been reported - 4,890,594,740 cycles # 2.626 GHz - 12,341,872,390 instructions # 2.52 insn per cycle - 1.863321950 seconds time elapsed + 4,887,653,189 cycles # 2.656 GHz + 12,340,725,033 instructions # 2.52 insn per cycle + 1.841006928 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.721792e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.733746e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.733746e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.741757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.753368e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.753368e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.449544 sec +TOTAL : 2.442506 sec INFO: No Floating Point Exceptions have been reported - 4,111,968,902 cycles # 1.676 GHz - 6,335,563,564 instructions # 1.54 insn per cycle - 2.453951471 seconds time elapsed + 4,119,197,476 cycles # 1.684 GHz + 6,334,707,467 instructions # 1.54 insn per cycle + 2.446806756 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index b300efd9c0..845fe92d47 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:07:12 +DATE: 2024-06-02_21:33:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.175456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.487401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.489887e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.175037e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.487201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.490172e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.516778 sec +TOTAL : 0.519308 sec INFO: No Floating Point Exceptions have been reported - 2,106,838,284 cycles # 2.817 GHz - 3,334,047,065 instructions # 1.58 insn per cycle - 0.806903831 
seconds time elapsed + 2,132,091,270 cycles # 2.847 GHz + 3,400,172,905 instructions # 1.59 insn per cycle + 0.809359327 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -70,15 +70,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.725415e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181222e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182613e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.733086e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.182437e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.183712e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.208931 sec +TOTAL : 3.211390 sec INFO: No Floating Point Exceptions have been reported - 9,884,616,856 cycles # 2.852 GHz - 22,569,706,597 instructions # 2.28 insn per cycle - 3.521271497 seconds time elapsed + 9,891,188,972 cycles # 2.857 GHz + 21,285,655,080 instructions # 2.15 insn per cycle + 3.520480517 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -99,15 +99,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838805e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839662e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839662e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.838729e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.839597e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.839597e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.928794 sec +TOTAL : 8.929008 sec INFO: No Floating Point Exceptions have been reported - 25,578,535,475 cycles # 2.864 GHz - 78,941,438,017 instructions # 3.09 insn per cycle - 8.932959256 seconds time elapsed + 25,610,442,672 cycles # 2.867 GHz + 78,938,090,928 instructions # 3.08 insn per cycle + 8.933282594 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -127,15 +127,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.490750e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.493870e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.493870e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.506228e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.509410e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.509410e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.708141 sec +TOTAL : 4.687492 sec INFO: No Floating Point Exceptions have been reported - 12,873,433,154 cycles # 
2.733 GHz - 39,280,620,994 instructions # 3.05 insn per cycle - 4.712353785 seconds time elapsed + 12,899,492,836 cycles # 2.750 GHz + 39,283,102,027 instructions # 3.05 insn per cycle + 4.691777787 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -155,15 +155,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.853620e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.869024e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.869024e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.752261e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.768022e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.768022e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.097850 sec +TOTAL : 2.125292 sec INFO: No Floating Point Exceptions have been reported - 5,573,477,429 cycles # 2.652 GHz - 13,685,909,410 instructions # 2.46 insn per cycle - 2.102047066 seconds time elapsed + 5,579,027,054 cycles # 2.621 GHz + 13,686,176,373 instructions # 2.45 insn per cycle + 2.129518303 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,15 +183,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.970623e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.991129e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.991129e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.957937e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.978146e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.978146e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.837793 sec +TOTAL : 1.840124 sec INFO: No Floating Point Exceptions have been reported - 4,885,535,539 cycles # 2.653 GHz - 12,340,762,979 instructions # 2.53 insn per cycle - 1.841998870 seconds time elapsed + 4,887,669,768 cycles # 2.651 GHz + 12,340,977,183 instructions # 2.52 insn per cycle + 1.844385583 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -211,15 +211,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.715803e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.727367e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.727367e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.740559e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.752226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.752226e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.451983 sec +TOTAL : 2.443234 sec INFO: No Floating Point Exceptions have been reported - 4,110,713,398 cycles # 1.674 GHz - 6,334,867,690 instructions # 1.54 insn per cycle - 2.456147392 seconds time elapsed + 4,110,489,538 cycles # 1.680 GHz + 6,334,661,219 instructions # 1.54 insn per cycle + 2.447553100 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 254c65fd8c..f7617fa14d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:40:17 +DATE: 2024-06-02_20:56:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.472040e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495257e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497568e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.465250e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.491924e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494561e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530193 sec +TOTAL : 0.531384 sec INFO: No Floating Point Exceptions have been reported - 2,179,825,483 cycles # 2.820 GHz - 3,416,926,116 instructions # 1.57 insn per cycle - 0.832303660 seconds time elapsed + 
2,176,903,857 cycles # 2.812 GHz + 3,360,015,570 instructions # 1.54 insn per cycle + 0.832792471 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.149957e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178689e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144291e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180483e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.025570 sec +TOTAL : 3.021369 sec INFO: No Floating Point Exceptions have been reported - 9,343,829,120 cycles # 2.851 GHz - 20,017,847,921 instructions # 2.14 insn per cycle - 3.337093329 seconds time elapsed + 9,368,773,778 cycles # 2.857 GHz + 21,286,316,558 instructions # 2.27 insn per cycle + 3.335014178 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.844549e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.845438e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.845438e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.844326e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.845217e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845217e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.900941 sec +TOTAL : 8.901599 sec INFO: No Floating Point Exceptions have been reported - 25,492,945,375 cycles # 2.863 GHz - 78,715,017,784 instructions # 3.09 insn per cycle - 8.905151100 seconds time elapsed + 25,466,417,477 cycles # 2.860 GHz + 78,709,901,314 instructions # 3.09 insn per cycle + 8.905880803 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.432714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.435728e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.435728e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.439967e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443121e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.786568 sec +TOTAL : 4.776530 sec INFO: No Floating Point Exceptions have been reported - 12,968,671,480 cycles # 
2.709 GHz - 39,227,279,421 instructions # 3.02 insn per cycle - 4.790848376 seconds time elapsed + 12,973,172,137 cycles # 2.714 GHz + 39,229,674,228 instructions # 3.02 insn per cycle + 4.780939174 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.791500e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.806568e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.806568e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.803639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.819236e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.819236e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.114108 sec +TOTAL : 2.110784 sec INFO: No Floating Point Exceptions have been reported - 5,617,875,214 cycles # 2.653 GHz - 13,801,216,605 instructions # 2.46 insn per cycle - 2.118326582 seconds time elapsed + 5,623,478,205 cycles # 2.660 GHz + 13,801,627,183 instructions # 2.45 insn per cycle + 2.115128076 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.808696e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.827867e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.827867e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.797294e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.817444e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.817444e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.871093 sec +TOTAL : 1.873371 sec INFO: No Floating Point Exceptions have been reported - 4,977,184,975 cycles # 2.656 GHz - 12,467,160,434 instructions # 2.50 insn per cycle - 1.875328468 seconds time elapsed + 4,983,137,076 cycles # 2.655 GHz + 12,465,949,717 instructions # 2.50 insn per cycle + 1.877612639 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.708154e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.719459e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.719459e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.717520e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.728878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.728878e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.454314 sec +TOTAL : 2.451074 sec INFO: No Floating Point Exceptions have been reported - 4,118,637,907 cycles # 1.676 GHz - 6,458,862,875 instructions # 1.57 insn per cycle - 2.458530246 seconds time elapsed + 4,120,290,741 cycles # 1.679 GHz + 6,458,681,411 instructions # 1.57 insn per cycle + 2.455362823 seconds time elapsed =Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 452f4e853d..0fe5c16438 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:53:22 +DATE: 2024-06-02_21:19:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.253411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.278108e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.280152e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.246479e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.270244e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.272477e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.538797 sec +TOTAL : 0.541806 sec INFO: No Floating Point Exceptions have been reported - 2,198,780,840 cycles # 2.857 GHz - 3,392,092,682 instructions # 1.54 insn per cycle - 0.826434194 seconds time elapsed + 
2,171,468,423 cycles # 2.820 GHz + 3,387,210,535 instructions # 1.56 insn per cycle + 0.829471359 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.756018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.782691e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.783822e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.755733e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.780823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781841e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.316768 sec +TOTAL : 3.311285 sec INFO: No Floating Point Exceptions have been reported - 10,315,360,608 cycles # 2.881 GHz - 23,624,745,879 instructions # 2.29 insn per cycle - 3.638219909 seconds time elapsed + 10,187,766,661 cycles # 2.856 GHz + 22,069,205,702 instructions # 2.17 insn per cycle + 3.622898713 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.179521e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179964e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.179964e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.133683e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.134122e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134122e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.248920 sec +TOTAL : 39.685051 sec INFO: No Floating Point Exceptions have been reported - 113,511,319,041 cycles # 2.892 GHz - 144,820,446,927 instructions # 1.28 insn per cycle - 39.253177511 seconds time elapsed + 113,512,735,354 cycles # 2.860 GHz + 144,824,168,290 instructions # 1.28 insn per cycle + 39.689377450 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:21353) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.047626e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.050057e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.050057e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.009048e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.011479e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.011479e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.390362 sec +TOTAL : 5.460357 sec INFO: No Floating Point Exceptions have been reported - 14,740,564,650 
cycles # 2.733 GHz - 37,575,494,329 instructions # 2.55 insn per cycle - 5.394647902 seconds time elapsed + 14,780,198,562 cycles # 2.706 GHz + 37,576,710,982 instructions # 2.54 insn per cycle + 5.464730306 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.230737e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.243892e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.243892e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.168780e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181780e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181780e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.278344 sec +TOTAL : 2.297592 sec INFO: No Floating Point Exceptions have been reported - 6,134,003,628 cycles # 2.689 GHz - 13,061,930,844 instructions # 2.13 insn per cycle - 2.282738143 seconds time elapsed + 6,127,083,636 cycles # 2.663 GHz + 13,063,845,546 instructions # 2.13 insn per cycle + 2.302025246 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.779670e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.799133e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.799133e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.695008e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.714356e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.714356e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.877722 sec +TOTAL : 1.895827 sec INFO: No Floating Point Exceptions have been reported - 5,068,047,565 cycles # 2.694 GHz - 11,440,450,267 instructions # 2.26 insn per cycle - 1.882139324 seconds time elapsed + 5,063,974,681 cycles # 2.666 GHz + 11,441,302,228 instructions # 2.26 insn per cycle + 1.900215464 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.093705e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.106755e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.106755e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.958952e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.971780e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.971780e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.321771 sec +TOTAL : 2.366561 sec INFO: No Floating Point Exceptions have been reported - 3,974,444,581 cycles # 1.709 GHz - 5,942,873,144 instructions # 1.50 insn per cycle - 2.326156002 seconds time elapsed + 3,976,192,244 cycles # 1.678 GHz + 5,945,001,398 instructions # 1.50 insn per cycle + 2.371015031 seconds time elapsed 
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 00ea23e18d..eab4a6ad11 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:54:29 +DATE: 2024-06-02_21:20:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.259147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.284136e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.286360e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.265532e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.290379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292594e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.536782 sec +TOTAL : 0.539246 sec INFO: No Floating Point Exceptions have been reported - 2,193,506,190 cycles # 2.857 GHz - 3,337,314,407 instructions # 1.52 insn per cycle - 0.824492176 seconds time 
elapsed + 2,164,648,283 cycles # 2.819 GHz + 3,395,016,138 instructions # 1.57 insn per cycle + 0.825962894 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.761556e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.788263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.789425e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.767479e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.792561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.793589e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.301197 sec +TOTAL : 3.296186 sec INFO: No Floating Point Exceptions have been reported - 10,264,886,616 cycles # 2.886 GHz - 23,377,018,059 instructions # 2.28 insn per cycle - 3.615104997 seconds time elapsed + 10,182,455,226 cycles # 2.857 GHz + 22,751,947,545 instructions # 2.23 insn per cycle + 3.619847665 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.170908e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.171353e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.171353e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.098681e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.099131e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099131e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.330349 sec +TOTAL : 40.023617 sec INFO: No Floating Point Exceptions have been reported - 113,688,017,774 cycles # 2.891 GHz - 144,788,018,158 instructions # 1.27 insn per cycle - 39.334720458 seconds time elapsed + 114,408,903,354 cycles # 2.859 GHz + 144,789,258,871 instructions # 1.27 insn per cycle + 40.028023083 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.974783e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.977013e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.977013e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.944993e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.947319e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.947319e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.522774 sec +TOTAL : 5.578054 sec INFO: No Floating Point Exceptions have been reported - 15,220,566,650 
cycles # 2.755 GHz - 37,763,046,074 instructions # 2.48 insn per cycle - 5.527045303 seconds time elapsed + 15,223,576,233 cycles # 2.728 GHz + 37,762,970,352 instructions # 2.48 insn per cycle + 5.582406080 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.412795e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.426610e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.426610e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.278833e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.292510e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.292510e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.222010 sec +TOTAL : 2.262588 sec INFO: No Floating Point Exceptions have been reported - 6,000,419,836 cycles # 2.696 GHz - 12,896,174,142 instructions # 2.15 insn per cycle - 2.226315650 seconds time elapsed + 6,007,020,457 cycles # 2.651 GHz + 12,896,115,872 instructions # 2.15 insn per cycle + 2.266904685 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.743711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.762861e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.762861e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.679346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.698475e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.698475e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.885038 sec +TOTAL : 1.899294 sec INFO: No Floating Point Exceptions have been reported - 5,086,798,971 cycles # 2.694 GHz - 11,447,968,989 instructions # 2.25 insn per cycle - 1.889284279 seconds time elapsed + 5,094,216,811 cycles # 2.677 GHz + 11,448,333,625 instructions # 2.25 insn per cycle + 1.903608667 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.141072e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.153903e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.153903e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.004138e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016815e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.016815e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.306249 sec +TOTAL : 2.351445 sec INFO: No Floating Point Exceptions have been reported - 3,947,559,408 cycles # 1.709 GHz - 5,896,754,674 instructions # 1.49 insn per cycle - 2.310527958 seconds time elapsed + 3,952,954,226 cycles # 1.679 GHz + 5,896,992,592 instructions # 1.49 insn per cycle + 2.356043721 seconds time elapsed 
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 15bbe59069..fac6650d6a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:40:50 +DATE: 2024-06-02_20:57:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.326887e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370559e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.376371e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.356553e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.410019e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.415334e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487782 sec +TOTAL : 0.487456 sec INFO: No Floating Point Exceptions have been reported - 1,984,813,926 cycles # 2.807 GHz - 2,933,686,219 instructions # 1.48 insn per cycle - 0.764328783 seconds time 
elapsed + 1,983,251,875 cycles # 2.820 GHz + 2,928,209,714 instructions # 1.48 insn per cycle + 0.764609113 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.584549e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.644337e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.647136e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.608392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.688100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.691532e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.720884 sec +TOTAL : 1.725977 sec INFO: No Floating Point Exceptions have been reported - 5,560,374,951 cycles # 2.849 GHz - 11,900,809,748 instructions # 2.14 insn per cycle - 2.008088048 seconds time elapsed + 5,578,929,082 cycles # 2.849 GHz + 11,014,391,874 instructions # 1.97 insn per cycle + 2.015111587 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.909633e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910547e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910547e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.908177e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.909097e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909097e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.596251 sec +TOTAL : 8.603316 sec INFO: No Floating Point Exceptions have been reported - 24,624,004,022 cycles # 2.864 GHz - 78,129,381,217 instructions # 3.17 insn per cycle - 8.600293639 seconds time elapsed + 24,630,404,396 cycles # 2.862 GHz + 78,128,784,942 instructions # 3.17 insn per cycle + 8.607433386 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.891953e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.904635e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.904635e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.834941e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.847074e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.847074e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.387612 sec +TOTAL : 2.408187 sec INFO: No Floating Point Exceptions have been reported - 6,469,659,104 cycles # 
2.706 GHz - 20,120,611,338 instructions # 3.11 insn per cycle - 2.391816623 seconds time elapsed + 6,475,075,186 cycles # 2.685 GHz + 20,120,578,414 instructions # 3.11 insn per cycle + 2.412367232 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.562010e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568248e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568248e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.547510e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.553645e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553645e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.059004 sec +TOTAL : 1.067919 sec INFO: No Floating Point Exceptions have been reported - 2,818,181,262 cycles # 2.654 GHz - 6,988,460,270 instructions # 2.48 insn per cycle - 1.063195979 seconds time elapsed + 2,818,351,962 cycles # 2.631 GHz + 6,988,245,481 instructions # 2.48 insn per cycle + 1.072071344 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.763183e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.771185e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.771185e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.757892e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.765838e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.765838e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.938662 sec +TOTAL : 0.941407 sec INFO: No Floating Point Exceptions have been reported - 2,488,393,509 cycles # 2.641 GHz - 6,295,244,635 instructions # 2.53 insn per cycle - 0.942828770 seconds time elapsed + 2,493,554,219 cycles # 2.639 GHz + 6,295,971,949 instructions # 2.52 insn per cycle + 0.945627547 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.363218e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.368048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.368048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.363379e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.368145e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.368145e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.211358 sec +TOTAL : 1.211254 sec INFO: No Floating Point Exceptions have been reported - 2,044,658,355 cycles # 1.683 GHz - 3,265,998,063 instructions # 1.60 insn per cycle - 1.215542758 seconds time elapsed + 2,046,343,979 cycles # 1.685 GHz + 3,265,913,971 instructions # 1.60 insn per cycle + 1.215412846 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index e281ad389f..bcf7be18e6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:03:26 +DATE: 2024-06-02_21:29:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.615502e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.322427e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.322427e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.598474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.301712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.301712e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.477115 sec +TOTAL : 0.472439 sec INFO: No Floating Point Exceptions have been reported - 1,936,349,619 cycles # 2.809 GHz - 2,877,179,431 instructions # 1.49 insn per cycle - 0.747561501 seconds time elapsed + 1,958,040,860 cycles # 2.820 GHz + 2,939,762,551 instructions # 1.50 insn per cycle + 0.750896192 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.243623e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.556013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.556013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.273826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.571327e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.571327e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.907816 sec +TOTAL : 1.901552 sec INFO: No Floating Point Exceptions have been reported - 6,131,638,198 cycles # 2.845 GHz - 12,981,768,605 instructions # 2.12 insn per cycle - 2.213144159 seconds time elapsed + 6,095,137,537 cycles # 2.849 GHz + 12,940,780,690 instructions # 2.12 insn per cycle + 2.195795529 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.909165e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910120e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910120e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.910575e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.911506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911506e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.600727 sec +TOTAL : 8.594991 sec INFO: No Floating Point Exceptions have been reported - 24,637,778,479 cycles # 2.864 GHz - 78,132,610,249 instructions # 3.17 insn per cycle - 8.604942209 seconds time elapsed + 24,652,141,911 cycles # 2.867 GHz + 78,137,160,167 instructions # 3.17 insn per cycle + 8.599344489 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.457452e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.468775e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.468775e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.898082e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.911105e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.911105e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.550311 sec +TOTAL : 2.388247 sec INFO: No Floating Point Exceptions have been reported - 6,935,275,139 cycles # 
2.716 GHz - 20,130,100,658 instructions # 2.90 insn per cycle - 2.554710358 seconds time elapsed + 6,478,062,029 cycles # 2.708 GHz + 20,129,777,692 instructions # 3.11 insn per cycle + 2.392637226 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.550561e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.557160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.557160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.558890e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565336e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565336e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.068476 sec +TOTAL : 1.063397 sec INFO: No Floating Point Exceptions have been reported - 2,830,711,742 cycles # 2.640 GHz - 6,997,830,070 instructions # 2.47 insn per cycle - 1.072903816 seconds time elapsed + 2,829,584,006 cycles # 2.652 GHz + 6,998,429,462 instructions # 2.47 insn per cycle + 1.067880984 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.772339e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.780808e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.780808e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.773701e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782227e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782227e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.936387 sec +TOTAL : 0.935665 sec INFO: No Floating Point Exceptions have been reported - 2,497,824,247 cycles # 2.658 GHz - 6,305,168,616 instructions # 2.52 insn per cycle - 0.940674173 seconds time elapsed + 2,499,925,740 cycles # 2.661 GHz + 6,304,962,307 instructions # 2.52 insn per cycle + 0.940287567 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.362852e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.367803e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367803e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.365132e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.370126e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.370126e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.214531 sec +TOTAL : 1.212746 sec INFO: No Floating Point Exceptions have been reported - 2,054,265,568 cycles # 1.686 GHz - 3,276,400,100 instructions # 1.59 insn per cycle - 1.218830996 seconds time elapsed + 2,057,028,781 cycles # 1.691 GHz + 3,276,379,459 instructions # 1.59 insn per cycle + 1.217212255 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 1c3846a692..b890671a07 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:13:15 +DATE: 2024-06-02_21:39:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.362325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.415082e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.420724e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.323640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.375162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.380930e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.471632 sec +TOTAL : 0.470190 sec INFO: No Floating Point Exceptions have been reported - 1,968,514,245 cycles # 2.841 GHz - 2,902,581,432 instructions # 1.47 insn per cycle - 0.750206216 
seconds time elapsed + 1,954,941,965 cycles # 2.821 GHz + 2,911,153,946 instructions # 1.49 insn per cycle + 0.750497586 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.620229e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.693332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.696842e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.584082e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.652641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.655733e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.810763 sec +TOTAL : 1.807227 sec INFO: No Floating Point Exceptions have been reported - 5,856,364,996 cycles # 2.870 GHz - 12,360,478,892 instructions # 2.11 insn per cycle - 2.100167053 seconds time elapsed + 5,802,713,802 cycles # 2.849 GHz + 11,535,404,543 instructions # 1.99 insn per cycle + 2.092606216 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.921942e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.922884e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.922884e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.913976e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914935e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914935e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.542686 sec +TOTAL : 8.577750 sec INFO: No Floating Point Exceptions have been reported - 24,622,493,732 cycles # 2.881 GHz - 78,127,963,456 instructions # 3.17 insn per cycle - 8.546707601 seconds time elapsed + 24,611,965,012 cycles # 2.868 GHz + 78,127,241,298 instructions # 3.17 insn per cycle + 8.581839983 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.925135e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.937766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.937766e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.890077e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.902996e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.902996e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.378023 sec +TOTAL : 2.389920 sec INFO: No Floating Point Exceptions have been reported - 6,481,719,151 cycles # 
2.722 GHz - 20,120,720,773 instructions # 3.10 insn per cycle - 2.382079719 seconds time elapsed + 6,479,004,824 cycles # 2.707 GHz + 20,120,753,195 instructions # 3.11 insn per cycle + 2.394143360 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.581338e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.587855e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.587855e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.544733e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.550971e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.550971e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.046370 sec +TOTAL : 1.071318 sec INFO: No Floating Point Exceptions have been reported - 2,822,358,408 cycles # 2.688 GHz - 6,985,542,199 instructions # 2.48 insn per cycle - 1.050425346 seconds time elapsed + 2,822,669,649 cycles # 2.626 GHz + 6,987,403,130 instructions # 2.48 insn per cycle + 1.075405956 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.806198e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.814674e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.814674e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.762670e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.770899e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.770899e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.917862 sec +TOTAL : 0.940633 sec INFO: No Floating Point Exceptions have been reported - 2,496,110,223 cycles # 2.709 GHz - 6,293,657,033 instructions # 2.52 insn per cycle - 0.921934399 seconds time elapsed + 2,495,155,242 cycles # 2.643 GHz + 6,294,152,477 instructions # 2.52 insn per cycle + 0.944657528 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.393764e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.398765e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.398765e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.367006e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.372042e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.372042e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.186576 sec +TOTAL : 1.209697 sec INFO: No Floating Point Exceptions have been reported - 2,050,577,153 cycles # 1.723 GHz - 3,264,219,053 instructions # 1.59 insn per cycle - 1.190613213 seconds time elapsed + 2,049,421,235 cycles # 1.690 GHz + 3,264,511,946 instructions # 1.59 insn per cycle + 1.213845698 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 97148e3ba7..d9b7ee3321 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:10:28 +DATE: 2024-06-02_21:36:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.326390e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.378340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.384052e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.354433e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.405954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.411991e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.468432 sec +TOTAL : 0.469295 sec INFO: No Floating Point Exceptions have been reported - 1,956,153,885 cycles # 2.820 GHz - 2,925,124,547 instructions # 1.50 insn per cycle - 0.750741002 
seconds time elapsed + 1,931,447,531 cycles # 2.818 GHz + 2,857,411,874 instructions # 1.48 insn per cycle + 0.742623214 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.616852e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.690868e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.694290e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.583355e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.650562e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.653652e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.758835 sec +TOTAL : 1.754167 sec INFO: No Floating Point Exceptions have been reported - 5,694,632,258 cycles # 2.846 GHz - 12,170,382,669 instructions # 2.14 insn per cycle - 2.057387110 seconds time elapsed + 5,647,206,768 cycles # 2.850 GHz + 12,402,873,577 instructions # 2.20 insn per cycle + 2.038447332 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.910170e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.911121e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.911121e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.908988e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.909932e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909932e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.594641 sec +TOTAL : 8.599963 sec INFO: No Floating Point Exceptions have been reported - 24,610,525,016 cycles # 2.863 GHz - 78,132,278,540 instructions # 3.17 insn per cycle - 8.598723021 seconds time elapsed + 24,610,286,619 cycles # 2.861 GHz + 78,133,539,217 instructions # 3.17 insn per cycle + 8.604049688 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.888566e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.900956e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.900956e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.879217e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.891579e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891579e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.388862 sec +TOTAL : 2.392369 sec INFO: No Floating Point Exceptions have been reported - 6,476,954,136 cycles # 
2.708 GHz - 20,121,920,046 instructions # 3.11 insn per cycle - 2.393015096 seconds time elapsed + 6,476,796,255 cycles # 2.704 GHz + 20,121,504,943 instructions # 3.11 insn per cycle + 2.396533403 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.562187e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568435e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568435e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.559523e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565982e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.057977 sec +TOTAL : 1.059899 sec INFO: No Floating Point Exceptions have been reported - 2,818,730,747 cycles # 2.656 GHz - 6,988,428,853 instructions # 2.48 insn per cycle - 1.062013241 seconds time elapsed + 2,823,568,100 cycles # 2.655 GHz + 6,988,803,220 instructions # 2.48 insn per cycle + 1.064076387 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.767100e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.775059e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.775059e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.763562e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.771848e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.771848e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.936311 sec +TOTAL : 0.938280 sec INFO: No Floating Point Exceptions have been reported - 2,487,367,369 cycles # 2.647 GHz - 6,295,352,067 instructions # 2.53 insn per cycle - 0.940344403 seconds time elapsed + 2,491,711,137 cycles # 2.646 GHz + 6,295,398,273 instructions # 2.53 insn per cycle + 0.942474198 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.354758e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.359575e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359575e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.365021e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.369781e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.369781e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.218960 sec +TOTAL : 1.209851 sec INFO: No Floating Point Exceptions have been reported - 2,057,435,423 cycles # 1.683 GHz - 3,266,628,935 instructions # 1.59 insn per cycle - 1.223151915 seconds time elapsed + 2,048,944,436 cycles # 1.689 GHz + 3,266,101,120 instructions # 1.59 insn per cycle + 1.214099702 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index dc12ca7aae..ae89ba0a21 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_15:07:45 +DATE: 2024-06-02_21:34:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.747793e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.405382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.411341e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.743567e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.402148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.408206e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.473580 sec +TOTAL : 0.471325 sec INFO: No Floating Point Exceptions have been reported - 1,929,031,590 cycles # 2.811 GHz - 2,902,080,173 instructions # 1.50 insn per cycle - 0.744461149 
seconds time elapsed + 1,966,995,104 cycles # 2.818 GHz + 2,861,570,291 instructions # 1.45 insn per cycle + 0.755170556 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -70,15 +70,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.464876e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.690964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.694375e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.483014e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.687507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.690776e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.841417 sec +TOTAL : 1.838348 sec INFO: No Floating Point Exceptions have been reported - 5,892,322,421 cycles # 2.846 GHz - 12,206,550,799 instructions # 2.07 insn per cycle - 2.128532659 seconds time elapsed + 5,879,672,141 cycles # 2.848 GHz + 11,762,772,012 instructions # 2.00 insn per cycle + 2.123420033 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -99,15 +99,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.911509e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912427e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912427e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.911656e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912578e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912578e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.588000 sec +TOTAL : 8.587423 sec INFO: No Floating Point Exceptions have been reported - 24,603,486,303 cycles # 2.864 GHz - 78,128,844,221 instructions # 3.18 insn per cycle - 8.592028071 seconds time elapsed + 24,606,124,860 cycles # 2.865 GHz + 78,133,915,634 instructions # 3.18 insn per cycle + 8.591534118 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -127,15 +127,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.897521e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.909886e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.909886e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.884793e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897138e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.897138e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.385730 sec +TOTAL : 2.390128 sec INFO: No Floating Point Exceptions have been reported - 6,477,077,766 cycles # 
2.711 GHz - 20,121,628,941 instructions # 3.11 insn per cycle - 2.389950461 seconds time elapsed + 6,472,493,425 cycles # 2.704 GHz + 20,120,111,462 instructions # 3.11 insn per cycle + 2.394291914 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -155,15 +155,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.564279e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.570570e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.570570e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.539420e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.545531e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.545531e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.056604 sec +TOTAL : 1.073549 sec INFO: No Floating Point Exceptions have been reported - 2,817,814,854 cycles # 2.658 GHz - 6,988,003,654 instructions # 2.48 insn per cycle - 1.060745031 seconds time elapsed + 2,823,771,290 cycles # 2.622 GHz + 6,988,702,898 instructions # 2.47 insn per cycle + 1.077710790 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,15 +183,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.769248e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.777272e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.777272e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.753139e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.761048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.761048e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.935150 sec +TOTAL : 0.944047 sec INFO: No Floating Point Exceptions have been reported - 2,489,664,656 cycles # 2.652 GHz - 6,295,373,565 instructions # 2.53 insn per cycle - 0.939255376 seconds time elapsed + 2,494,675,517 cycles # 2.633 GHz + 6,296,230,342 instructions # 2.52 insn per cycle + 0.948274903 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -211,15 +211,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.359069e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.363945e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.363945e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.362634e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.367367e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367367e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.214973 sec +TOTAL : 1.211984 sec INFO: No Floating Point Exceptions have been reported - 2,049,104,437 cycles # 1.682 GHz - 3,266,431,248 instructions # 1.59 insn per cycle - 1.219018056 seconds time elapsed + 2,049,642,332 cycles # 1.687 GHz + 3,266,281,603 instructions # 1.59 insn per cycle + 1.216098332 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index fb9b3d5f50..2894e34cf4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:41:16 +DATE: 2024-06-02_20:57:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.355693e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.401036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.406994e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.333854e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.386711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.394716e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.484635 sec +TOTAL : 0.491038 sec INFO: No Floating Point Exceptions have been reported - 1,972,569,534 cycles # 2.816 GHz - 2,939,499,932 instructions # 1.49 insn per cycle - 0.757402101 seconds time elapsed + 
1,996,644,525 cycles # 2.816 GHz + 2,959,911,635 instructions # 1.48 insn per cycle + 0.767468034 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.619113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.679355e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.682149e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.597534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.679033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.682352e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.713739 sec +TOTAL : 1.726608 sec INFO: No Floating Point Exceptions have been reported - 5,540,767,327 cycles # 2.848 GHz - 11,699,037,597 instructions # 2.11 insn per cycle - 2.001424634 seconds time elapsed + 5,570,117,632 cycles # 2.845 GHz + 11,507,962,990 instructions # 2.07 insn per cycle + 2.016820610 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.914726e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.915647e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.915647e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915894e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916848e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916848e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.573445 sec +TOTAL : 8.568155 sec INFO: No Floating Point Exceptions have been reported - 24,558,709,341 cycles # 2.863 GHz - 77,854,833,330 instructions # 3.17 insn per cycle - 8.577561930 seconds time elapsed + 24,541,635,551 cycles # 2.863 GHz + 77,860,582,476 instructions # 3.17 insn per cycle + 8.572306446 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.975982e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.989277e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.989277e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.989453e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.002228e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.002228e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.359034 sec +TOTAL : 2.354308 sec INFO: No Floating Point Exceptions have been reported - 6,427,044,365 cycles # 
2.721 GHz - 20,086,102,386 instructions # 3.13 insn per cycle - 2.363343503 seconds time elapsed + 6,429,555,449 cycles # 2.727 GHz + 20,085,437,100 instructions # 3.12 insn per cycle + 2.358577961 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.504468e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510257e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510257e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495665e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501413e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.097904 sec +TOTAL : 1.104553 sec INFO: No Floating Point Exceptions have been reported - 2,922,790,348 cycles # 2.654 GHz - 7,129,934,034 instructions # 2.44 insn per cycle - 1.101954791 seconds time elapsed + 2,915,650,989 cycles # 2.631 GHz + 7,129,883,095 instructions # 2.45 insn per cycle + 1.108735215 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 1.699271e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.706623e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.706623e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680175e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.687445e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.687445e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.973114 sec +TOTAL : 0.984092 sec INFO: No Floating Point Exceptions have been reported - 2,595,556,237 cycles # 2.658 GHz - 6,438,662,691 instructions # 2.48 insn per cycle - 0.977341866 seconds time elapsed + 2,594,901,147 cycles # 2.627 GHz + 6,438,491,817 instructions # 2.48 insn per cycle + 0.988307717 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.316379e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.321013e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321013e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.316604e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.321189e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321189e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.254012 sec +TOTAL : 1.253756 sec INFO: No Floating Point Exceptions have been reported - 2,116,081,195 cycles # 1.683 GHz - 3,427,806,501 instructions # 1.62 insn per cycle - 1.258282002 seconds time elapsed + 2,120,187,945 cycles # 1.687 GHz + 3,427,717,458 instructions # 1.62 insn per cycle + 1.258078621 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2924) (512y: 22) (512z: 9654) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 3f8f67a608..f9728316f5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:55:36 +DATE: 2024-06-02_21:22:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.546626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.586411e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.590653e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.562593e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.602761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.607258e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.493388 sec +TOTAL : 0.496319 sec INFO: No Floating Point Exceptions have been reported - 2,052,942,224 cycles # 2.851 GHz - 3,071,897,705 instructions # 1.50 insn per cycle - 0.778091403 seconds time elapsed + 
2,073,990,194 cycles # 2.815 GHz + 3,053,942,926 instructions # 1.47 insn per cycle + 0.794515563 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.711232e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.769137e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.771784e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.711536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.769520e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.772095e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.862053 sec +TOTAL : 1.869015 sec INFO: No Floating Point Exceptions have been reported - 6,039,204,312 cycles # 2.872 GHz - 11,937,016,347 instructions # 1.98 insn per cycle - 2.158356809 seconds time elapsed + 6,036,319,674 cycles # 2.845 GHz + 12,339,909,244 instructions # 2.04 insn per cycle + 2.177861676 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.454006e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.454774e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.454774e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.442190e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.442945e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.442945e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 30.077717 sec +TOTAL : 30.143467 sec INFO: No Floating Point Exceptions have been reported - 86,228,096,895 cycles # 2.867 GHz - 135,581,749,205 instructions # 1.57 insn per cycle - 30.081848617 seconds time elapsed + 86,238,493,092 cycles # 2.861 GHz + 135,582,429,521 instructions # 1.57 insn per cycle + 30.147691683 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:15593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.767198e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.779100e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.779100e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.622664e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.634101e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.634101e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.432356 sec +TOTAL : 2.485418 sec INFO: No Floating Point Exceptions have been reported - 6,776,462,064 cycles 
# 2.783 GHz - 19,386,992,522 instructions # 2.86 insn per cycle - 2.436630257 seconds time elapsed + 6,780,106,144 cycles # 2.725 GHz + 19,386,070,044 instructions # 2.86 insn per cycle + 2.489679561 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.415254e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.420302e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.420302e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.375981e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380844e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380844e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.167177 sec +TOTAL : 1.200183 sec INFO: No Floating Point Exceptions have been reported - 3,174,327,264 cycles # 2.711 GHz - 6,807,988,001 instructions # 2.14 insn per cycle - 1.171487938 seconds time elapsed + 3,187,701,871 cycles # 2.648 GHz + 6,807,898,152 instructions # 2.14 insn per cycle + 1.204433728 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 
/ 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.702865e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.710950e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.710950e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.667985e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.675164e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.675164e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.971352 sec +TOTAL : 0.991406 sec INFO: No Floating Point Exceptions have been reported - 2,641,359,018 cycles # 2.709 GHz - 5,985,956,533 instructions # 2.27 insn per cycle - 0.975633569 seconds time elapsed + 2,635,968,315 cycles # 2.649 GHz + 5,985,925,835 instructions # 2.27 insn per cycle + 0.995667999 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.382138e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.387143e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.387143e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.343303e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.347932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.347932e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.195051 sec +TOTAL : 1.229182 sec INFO: No Floating Point Exceptions have been reported - 2,079,765,601 cycles # 1.735 GHz - 3,501,460,071 instructions # 1.68 insn per cycle - 1.199295448 seconds time elapsed + 2,077,787,049 cycles # 1.685 GHz + 3,500,922,258 instructions # 1.68 insn per cycle + 1.233472370 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 5210) (512y: 3) (512z:44829) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index f651d28060..a6e1efe771 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:56:25 +DATE: 2024-06-02_21:22:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.511551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.549792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.554590e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.474201e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.511992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.516380e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.495451 sec +TOTAL : 0.495445 sec INFO: No Floating Point Exceptions have been reported - 2,081,587,427 cycles # 2.833 GHz - 3,058,350,902 instructions # 1.47 insn per cycle - 0.791962629 seconds time elapsed + 
2,069,875,918 cycles # 2.815 GHz + 3,088,038,049 instructions # 1.49 insn per cycle + 0.793826258 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.609535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.664835e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.667390e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.633274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.688888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.691415e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.882393 sec +TOTAL : 1.883548 sec INFO: No Floating Point Exceptions have been reported - 6,108,387,546 cycles # 2.880 GHz - 12,340,826,531 instructions # 2.02 insn per cycle - 2.177538628 seconds time elapsed + 6,034,409,459 cycles # 2.847 GHz + 12,602,199,510 instructions # 2.09 insn per cycle + 2.178280060 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.501753e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.502516e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.502516e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.435127e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.435900e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.435900e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.816879 sec +TOTAL : 30.183033 sec INFO: No Floating Point Exceptions have been reported - 86,332,810,324 cycles # 2.895 GHz - 136,005,056,328 instructions # 1.58 insn per cycle - 29.820915946 seconds time elapsed + 86,348,373,585 cycles # 2.861 GHz + 135,991,147,369 instructions # 1.57 insn per cycle + 30.187286457 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:15571) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.665500e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.677672e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.677672e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.576661e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.588266e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.588266e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.468426 sec +TOTAL : 2.501741 sec INFO: No Floating Point Exceptions have been reported - 6,845,942,231 cycles 
# 2.769 GHz - 19,438,050,467 instructions # 2.84 insn per cycle - 2.472841886 seconds time elapsed + 6,860,063,616 cycles # 2.739 GHz + 19,439,732,968 instructions # 2.83 insn per cycle + 2.505990169 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.417376e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.422501e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.422501e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.407660e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.412779e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.412779e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.165327 sec +TOTAL : 1.173483 sec INFO: No Floating Point Exceptions have been reported - 3,124,352,057 cycles # 2.673 GHz - 6,718,803,660 instructions # 2.15 insn per cycle - 1.169556736 seconds time elapsed + 3,113,664,715 cycles # 2.645 GHz + 6,718,777,649 instructions # 2.16 insn per cycle + 1.177759140 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 
/ 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.691458e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.698947e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.698947e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.667132e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.674306e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674306e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.977716 sec +TOTAL : 0.991992 sec INFO: No Floating Point Exceptions have been reported - 2,637,169,918 cycles # 2.688 GHz - 5,969,286,098 instructions # 2.26 insn per cycle - 0.981942660 seconds time elapsed + 2,638,087,053 cycles # 2.650 GHz + 5,969,912,308 instructions # 2.26 insn per cycle + 0.996231534 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.359514e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.364207e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.364207e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342386e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.347018e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.347018e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.214640 sec +TOTAL : 1.229963 sec INFO: No Floating Point Exceptions have been reported - 2,077,190,375 cycles # 1.705 GHz - 3,494,266,618 instructions # 1.68 insn per cycle - 1.219012886 seconds time elapsed + 2,078,559,427 cycles # 1.687 GHz + 3,494,531,487 instructions # 1.68 insn per cycle + 1.234278676 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 4174) (512y: 4) (512z:44472) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 19b36f52e3..7c14a2e7fb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:41:41 +DATE: 2024-06-02_20:58:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.461685e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487571e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.482581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.509636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512012e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529512 sec +TOTAL : 0.527538 sec INFO: No Floating Point Exceptions have been reported - 2,178,837,447 cycles # 2.822 GHz - 3,364,663,947 instructions # 1.54 insn per cycle - 0.830788537 seconds time elapsed + 
2,176,696,799 cycles # 2.823 GHz + 3,403,965,501 instructions # 1.56 insn per cycle + 0.831403987 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.126264e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.153475e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.154649e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.140636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.174184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.175593e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.029127 sec +TOTAL : 3.042724 sec INFO: No Floating Point Exceptions have been reported - 9,370,132,482 cycles # 2.854 GHz - 19,961,685,193 instructions # 2.13 insn per cycle - 3.339329204 seconds time elapsed + 9,426,429,638 cycles # 2.856 GHz + 21,229,330,812 instructions # 2.25 insn per cycle + 3.355690618 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.814801e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.815638e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.815638e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.825999e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826873e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826873e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.046315 sec +TOTAL : 8.991087 sec INFO: No Floating Point Exceptions have been reported - 25,898,843,827 cycles # 2.862 GHz - 79,438,691,532 instructions # 3.07 insn per cycle - 9.050548799 seconds time elapsed + 25,893,198,103 cycles # 2.879 GHz + 79,438,485,543 instructions # 3.07 insn per cycle + 8.995289929 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.427576e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.430583e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.430583e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.417587e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.420701e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.420701e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.794109 sec +TOTAL : 4.808506 sec INFO: No Floating Point Exceptions have been reported - 12,707,110,349 cycles # 
2.649 GHz - 38,549,995,901 instructions # 3.03 insn per cycle - 4.798432518 seconds time elapsed + 12,725,370,972 cycles # 2.645 GHz + 38,549,760,913 instructions # 3.03 insn per cycle + 4.812856911 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.947160e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.962973e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.962973e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.894022e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.909715e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.909715e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.073357 sec +TOTAL : 2.087240 sec INFO: No Floating Point Exceptions have been reported - 5,517,673,615 cycles # 2.658 GHz - 13,479,814,632 instructions # 2.44 insn per cycle - 2.077628129 seconds time elapsed + 5,528,399,244 cycles # 2.644 GHz + 13,481,627,455 instructions # 2.44 insn per cycle + 2.091666528 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 
/ 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.092853e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.113453e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.113453e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.891097e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.911587e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.911587e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.813039 sec +TOTAL : 1.854687 sec INFO: No Floating Point Exceptions have been reported - 4,828,852,439 cycles # 2.658 GHz - 12,135,084,334 instructions # 2.51 insn per cycle - 1.817332368 seconds time elapsed + 4,870,728,509 cycles # 2.622 GHz + 12,137,042,883 instructions # 2.49 insn per cycle + 1.858964728 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.671146e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.682190e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.682190e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.691394e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.702803e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.702803e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.468744 sec +TOTAL : 2.460755 sec INFO: No Floating Point Exceptions have been reported - 4,141,507,976 cycles # 1.676 GHz - 6,337,241,929 instructions # 1.53 insn per cycle - 2.472886901 seconds time elapsed + 4,149,120,121 cycles # 1.684 GHz + 6,337,745,344 instructions # 1.53 insn per cycle + 2.465117818 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 10c707e81e..1d3301fafa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-16_14:42:15 +DATE: 2024-06-02_20:58:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.482619e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.506518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.508744e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.467668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.493937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496432e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530792 sec +TOTAL : 0.527278 sec INFO: No Floating Point Exceptions have been reported - 2,175,803,522 cycles # 2.817 GHz - 3,378,965,043 instructions # 1.55 insn per cycle - 0.832396723 seconds time elapsed + 
2,157,795,556 cycles # 2.825 GHz + 3,403,572,710 instructions # 1.58 insn per cycle + 0.822587991 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.148343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.175835e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.177034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.147774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181635e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182999e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.021680 sec +TOTAL : 3.025468 sec INFO: No Floating Point Exceptions have been reported - 9,373,127,330 cycles # 2.855 GHz - 21,008,547,067 instructions # 2.24 insn per cycle - 3.339100414 seconds time elapsed + 9,383,589,958 cycles # 2.858 GHz + 21,474,425,018 instructions # 2.29 insn per cycle + 3.338591075 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.816314e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.817146e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.817146e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.819625e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.820503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.820503e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.038908 sec +TOTAL : 9.022232 sec INFO: No Floating Point Exceptions have been reported - 25,883,565,851 cycles # 2.863 GHz - 79,454,182,113 instructions # 3.07 insn per cycle - 9.043041112 seconds time elapsed + 25,882,405,875 cycles # 2.868 GHz + 79,448,983,201 instructions # 3.07 insn per cycle + 9.026481543 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.444359e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.447378e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.447378e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.446182e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.449327e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.449327e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.770841 sec +TOTAL : 4.768299 sec INFO: No Floating Point Exceptions have been reported - 12,673,930,469 cycles # 
2.656 GHz - 38,521,208,960 instructions # 3.04 insn per cycle - 4.775036357 seconds time elapsed + 12,681,708,725 cycles # 2.658 GHz + 38,523,479,653 instructions # 3.04 insn per cycle + 4.772643197 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.869925e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.885399e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.885399e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.783293e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.799085e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.799085e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.093382 sec +TOTAL : 2.116349 sec INFO: No Floating Point Exceptions have been reported - 5,571,627,209 cycles # 2.657 GHz - 13,607,217,607 instructions # 2.44 insn per cycle - 2.097652206 seconds time elapsed + 5,573,346,630 cycles # 2.629 GHz + 13,607,371,055 instructions # 2.44 insn per cycle + 2.120654201 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 
/ 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.920636e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.941759e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.941759e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.950987e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.971335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.971335e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.847928 sec +TOTAL : 1.841569 sec INFO: No Floating Point Exceptions have been reported - 4,911,801,030 cycles # 2.653 GHz - 12,271,296,407 instructions # 2.50 insn per cycle - 1.852091714 seconds time elapsed + 4,914,422,282 cycles # 2.663 GHz + 12,272,016,530 instructions # 2.50 insn per cycle + 1.845933307 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.658898e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.670376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.670376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.694812e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.706153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.706153e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.472418 sec +TOTAL : 2.459293 sec INFO: No Floating Point Exceptions have been reported - 4,148,038,447 cycles # 1.675 GHz - 6,442,551,576 instructions # 1.55 insn per cycle - 2.476725391 seconds time elapsed + 4,148,774,251 cycles # 1.685 GHz + 6,442,210,372 instructions # 1.55 insn per cycle + 2.463549102 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 5b5bd116a3..2e640fb20e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_14:44:06 +DATE: 2024-06-02_21:00:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065678e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.066059e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066269e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.070046e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.070489e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.070728e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.433623 sec +TOTAL : 2.432122 sec INFO: No Floating Point Exceptions have been reported - 7,848,341,490 cycles # 2.847 GHz - 17,462,165,188 instructions # 2.22 insn per cycle - 2.813787845 seconds time 
elapsed + 7,909,240,752 cycles # 2.872 GHz + 18,000,344,677 instructions # 2.28 insn per cycle + 2.812238946 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.279047e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.280919e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.281197e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.257630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.259727e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.259989e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.985795 sec +TOTAL : 3.998685 sec INFO: No Floating Point Exceptions have been reported - 12,317,841,072 cycles # 2.855 GHz - 29,065,647,551 instructions # 2.36 insn per cycle - 4.369250222 seconds time elapsed + 12,358,861,001 cycles # 2.857 GHz + 27,265,356,364 instructions # 2.21 insn per cycle + 4.380623068 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.769533e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.769744e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.769744e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.364227e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.364435e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.364435e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.808252 sec +TOTAL : 7.176405 sec INFO: No Floating Point Exceptions have been reported - 18,798,528,369 cycles # 2.760 GHz - 53,916,630,138 instructions # 2.87 insn per cycle - 6.812355714 seconds time elapsed + 18,821,710,602 cycles # 2.622 GHz + 53,917,723,661 instructions # 2.86 insn per cycle + 7.180812312 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.539980e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.540062e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.540062e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537482e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.537565e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.537565e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.433152 sec +TOTAL : 3.438733 sec INFO: No Floating Point Exceptions have been reported - 9,799,231,624 cycles # 
2.852 GHz - 27,092,581,938 instructions # 2.76 insn per cycle - 3.437235180 seconds time elapsed + 9,825,974,360 cycles # 2.855 GHz + 27,092,527,909 instructions # 2.76 insn per cycle + 3.442940335 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.326889e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.327284e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.327284e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.317063e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.317444e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.317444e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.592072 sec +TOTAL : 1.596530 sec INFO: No Floating Point Exceptions have been reported - 4,220,179,984 cycles # 2.645 GHz - 9,560,887,701 instructions # 2.27 insn per cycle - 1.596045022 seconds time elapsed + 4,226,902,337 cycles # 2.642 GHz + 9,560,928,493 instructions # 2.26 insn per cycle + 1.600685064 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` 
= 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.770010e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.770613e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.770613e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.692227e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.692715e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.692715e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.405649 sec +TOTAL : 1.434526 sec INFO: No Floating Point Exceptions have been reported - 3,726,923,548 cycles # 2.645 GHz - 8,484,897,516 instructions # 2.28 insn per cycle - 1.409716339 seconds time elapsed + 3,746,125,551 cycles # 2.606 GHz + 8,486,014,947 instructions # 2.27 insn per cycle + 1.438619859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.281739e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.282258e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.282258e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.273656e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274120e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274120e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.614138 sec +TOTAL : 1.618156 sec INFO: No Floating Point Exceptions have been reported - 2,690,163,143 cycles # 1.663 GHz - 4,272,866,756 instructions # 1.59 insn per cycle - 1.618172762 seconds time elapsed + 2,695,756,195 cycles # 1.663 GHz + 4,273,774,333 instructions # 1.59 insn per cycle + 1.622321455 seconds time elapsed =Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 1be1cfeedf..1fadaabb4f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_15:03:52 +DATE: 2024-06-02_21:30:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.063403e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.064386e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.064386e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065370e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066381e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066381e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.395260 sec +TOTAL : 2.397754 sec INFO: No Floating Point Exceptions have been reported - 7,755,019,436 cycles # 2.852 GHz - 17,230,726,903 instructions # 2.22 insn per cycle - 2.775067655 seconds time elapsed + 7,762,497,768 cycles # 2.853 GHz + 16,203,483,295 instructions # 2.09 insn per cycle + 2.776448372 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.263412e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.297943e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.297943e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.238795e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.273940e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.273940e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.971807 sec +TOTAL : 3.985494 sec INFO: No Floating Point Exceptions have been reported - 12,283,103,403 cycles # 2.855 GHz - 27,758,308,143 instructions # 2.26 insn per cycle - 4.357937638 seconds time elapsed + 12,320,118,228 cycles # 2.858 GHz + 28,700,363,846 instructions # 2.33 insn per cycle + 4.365416695 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.361041e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.361236e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.361236e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.431447e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.431652e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.431652e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.177191 sec +TOTAL : 7.108151 sec INFO: No Floating Point Exceptions have been reported - 18,941,402,140 cycles # 2.638 GHz - 53,918,413,850 instructions # 2.85 insn per cycle - 7.181202320 seconds time elapsed + 18,792,897,710 cycles # 2.643 GHz + 53,918,227,536 instructions # 2.87 insn per cycle + 7.112225621 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.538220e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.538308e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.538308e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.546658e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.546743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.546743e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.437404 sec +TOTAL : 3.418887 sec INFO: No Floating Point Exceptions have been reported - 9,826,263,323 cycles 
# 2.856 GHz - 27,093,421,705 instructions # 2.76 insn per cycle - 3.441586183 seconds time elapsed + 9,791,096,347 cycles # 2.861 GHz + 27,093,479,045 instructions # 2.77 insn per cycle + 3.423017015 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.320877e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321321e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321321e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.304069e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.304481e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.304481e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.595267 sec +TOTAL : 1.603100 sec INFO: No Floating Point Exceptions have been reported - 4,226,325,559 cycles # 2.644 GHz - 9,562,000,988 instructions # 2.26 insn per cycle - 1.599357751 seconds time elapsed + 4,241,407,347 cycles # 2.640 GHz + 9,561,955,028 instructions # 2.25 insn per cycle + 1.607196815 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.768698e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.769266e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.769266e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.744578e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.745149e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745149e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.406299 sec +TOTAL : 1.415314 sec INFO: No Floating Point Exceptions have been reported - 3,728,202,948 cycles # 2.645 GHz - 8,485,828,873 instructions # 2.28 insn per cycle - 1.410433353 seconds time elapsed + 3,737,020,554 cycles # 2.634 GHz + 8,486,765,632 instructions # 2.27 insn per cycle + 1.419451948 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.279021e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.279511e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.279511e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.289384e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.289882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.289882e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.615850 sec +TOTAL : 1.610890 sec INFO: No Floating Point Exceptions have been reported - 2,693,497,833 cycles # 1.663 GHz - 4,273,840,765 instructions # 1.59 insn per cycle - 1.620067219 seconds time elapsed + 2,696,211,668 cycles # 1.670 GHz + 4,273,881,889 instructions # 1.59 insn per cycle + 1.615050450 seconds time elapsed 
=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 3f519fda03..b7c9be9361 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_14:45:01 +DATE: 2024-06-02_21:01:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065431e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065819e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066015e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065481e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065914e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066085e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.431749 sec +TOTAL : 2.431533 sec INFO: No Floating Point Exceptions have been reported - 7,858,443,167 cycles # 2.852 GHz - 17,797,449,482 instructions # 2.26 insn per cycle - 2.810886675 
seconds time elapsed + 7,864,728,245 cycles # 2.855 GHz + 16,581,142,625 instructions # 2.11 insn per cycle + 2.810896011 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.189050e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.190883e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.191132e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.234334e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.236420e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.236694e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.007219 sec +TOTAL : 4.002654 sec INFO: No Floating Point Exceptions have been reported - 12,388,147,716 cycles # 2.856 GHz - 29,572,084,158 instructions # 2.39 insn per cycle - 4.393669645 seconds time elapsed + 12,362,983,148 cycles # 2.857 GHz + 26,818,544,684 instructions # 2.17 insn per cycle + 4.385714343 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.555801e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.556015e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.556015e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.360451e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.360670e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.360670e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.995892 sec +TOTAL : 7.178712 sec INFO: No Floating Point Exceptions have been reported - 18,868,341,179 cycles # 2.696 GHz - 53,930,114,085 instructions # 2.86 insn per cycle - 6.999840535 seconds time elapsed + 18,903,471,951 cycles # 2.632 GHz + 53,926,959,837 instructions # 2.85 insn per cycle + 7.182807918 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.547899e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.547983e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.547983e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.554111e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.554204e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.554204e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.415669 sec +TOTAL : 3.401831 sec INFO: No Floating Point Exceptions have been reported - 9,762,163,827 cycles # 
2.856 GHz - 27,089,755,364 instructions # 2.77 insn per cycle - 3.419663266 seconds time elapsed + 9,728,814,018 cycles # 2.857 GHz + 27,089,535,875 instructions # 2.78 insn per cycle + 3.405876690 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.328520e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328923e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328923e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.279450e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279875e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279875e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.591769 sec +TOTAL : 1.614318 sec INFO: No Floating Point Exceptions have been reported - 4,217,350,816 cycles # 2.647 GHz - 9,560,856,496 instructions # 2.27 insn per cycle - 1.595749154 seconds time elapsed + 4,271,811,996 cycles # 2.641 GHz + 9,560,879,429 instructions # 2.24 insn per cycle + 1.618409778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` 
= 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.765235e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.765802e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.765802e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.724611e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.725106e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.725106e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.407302 sec +TOTAL : 1.422019 sec INFO: No Floating Point Exceptions have been reported - 3,737,969,275 cycles # 2.650 GHz - 8,484,674,655 instructions # 2.27 insn per cycle - 1.411406372 seconds time elapsed + 3,745,973,725 cycles # 2.628 GHz + 8,485,619,535 instructions # 2.27 insn per cycle + 1.426190804 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.273845e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.274322e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.274322e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.244405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.244886e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.244886e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.617639 sec +TOTAL : 1.632336 sec INFO: No Floating Point Exceptions have been reported - 2,695,774,477 cycles # 1.663 GHz - 4,276,120,388 instructions # 1.59 insn per cycle - 1.621698890 seconds time elapsed + 2,716,368,412 cycles # 1.661 GHz + 4,277,085,599 instructions # 1.57 insn per cycle + 1.636609876 seconds time elapsed =Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8097294660..9454f64bcc 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_14:45:55 +DATE: 2024-06-02_21:02:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.560287e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.561087e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.561509e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.559368e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.560230e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.560569e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.688313 sec +TOTAL : 1.689610 sec INFO: No Floating Point Exceptions have been reported - 5,610,919,333 cycles # 2.843 GHz - 12,076,970,192 instructions # 2.15 insn per cycle - 2.032164963 seconds time 
elapsed + 5,589,146,316 cycles # 2.841 GHz + 11,119,486,865 instructions # 1.99 insn per cycle + 2.023529940 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.335524e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336187e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336332e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.312667e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.313583e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.313721e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.920912 sec +TOTAL : 1.937585 sec INFO: No Floating Point Exceptions have been reported - 6,262,064,127 cycles # 2.846 GHz - 13,866,454,713 instructions # 2.21 insn per cycle - 2.256561773 seconds time elapsed + 6,322,260,150 cycles # 2.852 GHz + 12,991,612,203 instructions # 2.05 insn per cycle + 2.273050569 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.473644e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.473896e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.473896e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.449095e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.449366e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.449366e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.236471 sec +TOTAL : 6.254155 sec INFO: No Floating Point Exceptions have been reported - 17,834,532,335 cycles # 2.858 GHz - 53,589,179,622 instructions # 3.00 insn per cycle - 6.240522901 seconds time elapsed + 17,924,189,515 cycles # 2.865 GHz + 53,589,289,728 instructions # 2.99 insn per cycle + 6.258212218 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.311314e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.311704e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.311704e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.313591e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.314004e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.314004e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.599839 sec +TOTAL : 1.598328 sec INFO: No Floating Point Exceptions have been reported - 4,578,829,094 cycles # 
2.856 GHz - 13,761,810,246 instructions # 3.01 insn per cycle - 1.603811766 seconds time elapsed + 4,580,110,582 cycles # 2.860 GHz + 13,761,912,039 instructions # 3.00 insn per cycle + 1.602548280 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.636666e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.638274e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.638274e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.493632e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.495239e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.495239e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.800488 sec +TOTAL : 0.817743 sec INFO: No Floating Point Exceptions have been reported - 2,129,570,848 cycles # 2.649 GHz - 4,816,093,977 instructions # 2.26 insn per cycle - 0.804523713 seconds time elapsed + 2,143,262,417 cycles # 2.610 GHz + 4,816,174,375 instructions # 2.25 insn per cycle + 0.821948335 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.627354e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.629498e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.629498e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.535714e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.537917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.537917e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.697299 sec +TOTAL : 0.705994 sec INFO: No Floating Point Exceptions have been reported - 1,857,131,979 cycles # 2.651 GHz - 4,273,320,598 instructions # 2.30 insn per cycle - 0.701213399 seconds time elapsed + 1,872,359,401 cycles # 2.639 GHz + 4,273,597,055 instructions # 2.28 insn per cycle + 0.710000254 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.540089e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.542023e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.542023e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.586004e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.587962e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.587962e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.812936 sec +TOTAL : 0.806945 sec INFO: No Floating Point Exceptions have been reported - 1,360,618,833 cycles # 1.668 GHz - 2,159,125,772 instructions # 1.59 insn per cycle - 0.816997353 seconds time elapsed + 1,355,141,176 cycles # 1.672 GHz + 2,158,222,960 instructions # 1.59 insn per cycle + 0.811081253 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 6d352d97ac..c3dad58c83 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_15:04:47 +DATE: 2024-06-02_21:31:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.582684e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.584567e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.584567e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.586248e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.588038e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.588038e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.633735 sec +TOTAL : 1.639511 sec INFO: No Floating Point Exceptions have been reported - 5,453,177,396 cycles # 2.847 GHz - 11,627,188,509 instructions # 2.13 insn per cycle - 1.971399165 seconds time elapsed + 5,458,859,849 cycles # 2.849 GHz + 11,717,497,877 instructions # 2.15 insn per cycle + 1.972700319 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.292389e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.306150e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.306150e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.304060e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317709e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317709e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.932378 sec +TOTAL : 1.924656 sec INFO: No Floating Point Exceptions have been reported - 6,313,737,946 cycles # 2.848 GHz - 13,568,150,990 instructions # 2.15 insn per cycle - 2.274068662 seconds time elapsed + 6,299,130,122 cycles # 2.862 GHz + 13,932,257,788 instructions # 2.21 insn per cycle + 2.257564113 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.447529e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.447785e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.447785e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.449386e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.449634e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.449634e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.255519 sec +TOTAL : 6.252998 sec INFO: No Floating Point Exceptions have been reported - 17,871,844,477 cycles # 2.856 GHz - 53,590,423,890 instructions # 3.00 insn per cycle - 6.259496797 seconds time elapsed + 17,934,331,616 cycles # 2.867 GHz + 53,590,587,156 instructions # 2.99 insn per cycle + 6.257054326 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.319002e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.319403e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.319403e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.318510e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.318903e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.318903e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.595862 sec +TOTAL : 1.596218 sec INFO: No Floating Point Exceptions have been reported - 4,573,738,949 cycles # 
2.860 GHz - 13,762,785,828 instructions # 3.01 insn per cycle - 1.599904345 seconds time elapsed + 4,578,862,735 cycles # 2.862 GHz + 13,762,757,180 instructions # 3.01 insn per cycle + 1.600379529 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.613525e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.615218e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.615218e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.561862e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.563430e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.563430e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.804350 sec +TOTAL : 0.810097 sec INFO: No Floating Point Exceptions have been reported - 2,139,167,872 cycles # 2.648 GHz - 4,817,111,626 instructions # 2.25 insn per cycle - 0.808508083 seconds time elapsed + 2,150,908,758 cycles # 2.644 GHz + 4,817,064,263 instructions # 2.24 insn per cycle + 0.814268690 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.603124e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.605216e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.605216e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.557169e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.559485e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.559485e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.699524 sec +TOTAL : 0.704248 sec INFO: No Floating Point Exceptions have been reported - 1,862,402,974 cycles # 2.650 GHz - 4,274,167,467 instructions # 2.29 insn per cycle - 0.703628099 seconds time elapsed + 1,860,720,782 cycles # 2.629 GHz + 4,274,198,133 instructions # 2.30 insn per cycle + 0.708306150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.580110e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.582197e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.582197e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.582585e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.584548e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584548e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.807828 sec +TOTAL : 0.807471 sec INFO: No Floating Point Exceptions have been reported - 1,354,037,726 cycles # 1.669 GHz - 2,159,114,420 instructions # 1.59 insn per cycle - 0.811949308 seconds time elapsed + 1,357,966,185 cycles # 1.671 GHz + 2,159,181,276 instructions # 1.59 insn per cycle + 0.813237032 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 2d3f5a3740..5816b2c2c2 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_14:46:34 +DATE: 2024-06-02_21:03:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.539024e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.539847e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.540280e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.532712e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.533546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.533906e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.689103 sec +TOTAL : 1.691126 sec INFO: No Floating Point Exceptions have been reported - 5,627,474,622 cycles # 2.847 GHz - 11,923,534,222 instructions # 2.12 insn per cycle - 2.035228412 seconds time 
elapsed + 5,622,580,002 cycles # 2.845 GHz + 11,510,934,287 instructions # 2.05 insn per cycle + 2.034693744 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.303686e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.304329e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.304469e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.322031e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.322933e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.323068e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.932451 sec +TOTAL : 1.930959 sec INFO: No Floating Point Exceptions have been reported - 6,311,455,519 cycles # 2.848 GHz - 13,762,708,375 instructions # 2.18 insn per cycle - 2.272906437 seconds time elapsed + 6,307,622,079 cycles # 2.854 GHz + 13,926,153,465 instructions # 2.21 insn per cycle + 2.267828569 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.477402e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.477656e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.477656e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.476563e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.476825e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.476825e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.233047 sec +TOTAL : 6.234385 sec INFO: No Floating Point Exceptions have been reported - 17,803,580,317 cycles # 2.855 GHz - 53,580,069,164 instructions # 3.01 insn per cycle - 6.237030677 seconds time elapsed + 17,827,850,406 cycles # 2.859 GHz + 53,580,311,893 instructions # 3.01 insn per cycle + 6.238431546 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.307415e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.307805e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.307805e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.320430e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320855e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320855e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.601039 sec +TOTAL : 1.594879 sec INFO: No Floating Point Exceptions have been reported - 4,572,009,891 cycles # 
2.850 GHz - 13,755,353,111 instructions # 3.01 insn per cycle - 1.605120576 seconds time elapsed + 4,567,314,747 cycles # 2.858 GHz + 13,755,226,123 instructions # 3.01 insn per cycle + 1.598972704 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.585961e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587683e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587683e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.611819e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.613638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.613638e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.806516 sec +TOTAL : 0.803348 sec INFO: No Floating Point Exceptions have been reported - 2,141,220,761 cycles # 2.644 GHz - 4,818,439,860 instructions # 2.25 insn per cycle - 0.810543510 seconds time elapsed + 2,141,149,617 cycles # 2.654 GHz + 4,818,402,736 instructions # 2.25 insn per cycle + 0.807515784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.583228e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.585349e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.585349e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.568391e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.570476e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.570476e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.701072 sec +TOTAL : 0.704023 sec INFO: No Floating Point Exceptions have been reported - 1,870,651,613 cycles # 2.656 GHz - 4,275,203,774 instructions # 2.29 insn per cycle - 0.705038579 seconds time elapsed + 1,875,444,352 cycles # 2.651 GHz + 4,275,225,721 instructions # 2.28 insn per cycle + 0.708142027 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.570140e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.572065e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.572065e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.586825e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.588943e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.588943e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.808794 sec +TOTAL : 0.806684 sec INFO: No Floating Point Exceptions have been reported - 1,356,929,556 cycles # 1.671 GHz - 2,164,613,956 instructions # 1.60 insn per cycle - 0.812781092 seconds time elapsed + 1,360,116,629 cycles # 1.679 GHz + 2,164,473,202 instructions # 1.59 insn per cycle + 0.810866699 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3487) (512y: 34) (512z:79499) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index dfab5870bc..74b152faa4 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_14:47:13 +DATE: 2024-06-02_21:03:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.689455e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689959e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690212e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.688684e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.689237e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.689554e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.179203 sec +TOTAL : 2.181191 sec INFO: No Floating Point Exceptions have been reported - 7,126,539,551 cycles # 2.849 GHz - 15,807,759,758 instructions # 2.22 insn per cycle - 2.559686036 seconds time 
elapsed + 7,135,001,611 cycles # 2.850 GHz + 13,803,494,373 instructions # 1.93 insn per cycle + 2.561335539 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107889e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108154e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108192e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110278e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110643e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110684e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.424856 sec +TOTAL : 3.414487 sec INFO: No Floating Point Exceptions have been reported - 10,729,635,772 cycles # 2.852 GHz - 25,204,058,412 instructions # 2.35 insn per cycle - 3.820430433 seconds time elapsed + 10,695,558,623 cycles # 2.859 GHz + 24,933,179,401 instructions # 2.33 insn per cycle + 3.798291401 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.303415e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.303607e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.303607e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.308623e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.308815e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.308815e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.238072 sec +TOTAL : 7.229792 sec INFO: No Floating Point Exceptions have been reported - 19,150,406,884 cycles # 2.645 GHz - 54,154,394,762 instructions # 2.83 insn per cycle - 7.242308052 seconds time elapsed + 19,160,887,501 cycles # 2.649 GHz + 54,158,064,644 instructions # 2.83 insn per cycle + 7.233871441 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.497154e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.497235e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.497235e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495915e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.496004e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496004e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.531757 sec +TOTAL : 3.534835 sec INFO: No Floating Point Exceptions have been reported - 9,343,938,644 cycles # 
2.643 GHz - 26,158,830,842 instructions # 2.80 insn per cycle - 3.535758073 seconds time elapsed + 9,323,019,385 cycles # 2.635 GHz + 26,159,152,582 instructions # 2.81 insn per cycle + 3.538902916 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.453828e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.454275e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.454275e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.446264e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.446692e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.446692e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.534198 sec +TOTAL : 1.538022 sec INFO: No Floating Point Exceptions have been reported - 4,069,691,610 cycles # 2.648 GHz - 9,228,168,046 instructions # 2.27 insn per cycle - 1.538179495 seconds time elapsed + 4,070,123,740 cycles # 2.641 GHz + 9,227,321,198 instructions # 2.27 insn per cycle + 1.542213209 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.986335e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.986931e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.986931e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.010931e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.011595e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.011595e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.329366 sec +TOTAL : 1.321905 sec INFO: No Floating Point Exceptions have been reported - 3,528,184,184 cycles # 2.647 GHz - 8,174,614,993 instructions # 2.32 insn per cycle - 1.333451918 seconds time elapsed + 3,507,528,638 cycles # 2.647 GHz + 8,174,534,380 instructions # 2.33 insn per cycle + 1.326023150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.367967e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.368468e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.368468e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.380863e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.381372e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.381372e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.573046 sec +TOTAL : 1.567452 sec INFO: No Floating Point Exceptions have been reported - 2,618,946,865 cycles # 1.661 GHz - 4,154,480,374 instructions # 1.59 insn per cycle - 1.577054610 seconds time elapsed + 2,624,094,649 cycles # 1.671 GHz + 4,154,491,609 instructions # 1.58 insn per cycle + 1.571532309 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 3ddfb4805b..8617043553 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-16_14:48:06 +DATE: 2024-06-02_21:04:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.679279e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.679786e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.680054e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.686084e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.686624e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.686882e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.178048 sec +TOTAL : 2.181122 sec INFO: No Floating Point Exceptions have been reported - 7,134,880,740 cycles # 2.851 GHz - 15,434,594,866 instructions # 2.16 insn per cycle - 2.558453633 seconds time 
elapsed + 7,147,611,369 cycles # 2.854 GHz + 14,853,271,942 instructions # 2.08 insn per cycle + 2.560951446 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.104221e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104483e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104525e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108087e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108504e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.411600 sec +TOTAL : 3.416139 sec INFO: No Floating Point Exceptions have been reported - 10,672,973,002 cycles # 2.855 GHz - 24,521,846,399 instructions # 2.30 insn per cycle - 3.794724712 seconds time elapsed + 10,697,523,108 cycles # 2.858 GHz + 25,124,574,585 instructions # 2.35 insn per cycle + 3.798642472 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.893217e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.893439e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.893439e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.809282e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.809493e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.809493e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.694276 sec +TOTAL : 6.765846 sec INFO: No Floating Point Exceptions have been reported - 19,121,414,788 cycles # 2.855 GHz - 54,156,458,090 instructions # 2.83 insn per cycle - 6.698138270 seconds time elapsed + 19,320,733,855 cycles # 2.854 GHz + 54,152,931,560 instructions # 2.80 insn per cycle + 6.769909136 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.495395e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.495480e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.495480e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.498471e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.498552e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.498552e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.535528 sec +TOTAL : 3.528542 sec INFO: No Floating Point Exceptions have been reported - 9,391,010,006 cycles # 
2.654 GHz - 26,079,707,862 instructions # 2.78 insn per cycle - 3.539600596 seconds time elapsed + 9,412,357,855 cycles # 2.665 GHz + 26,078,069,796 instructions # 2.77 insn per cycle + 3.532621942 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.518532e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518969e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518969e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.535316e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.535751e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.535751e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.505616 sec +TOTAL : 1.497994 sec INFO: No Floating Point Exceptions have been reported - 4,001,150,405 cycles # 2.652 GHz - 9,212,868,850 instructions # 2.30 insn per cycle - 1.509560632 seconds time elapsed + 4,026,588,228 cycles # 2.682 GHz + 9,213,775,354 instructions # 2.29 insn per cycle + 1.502163303 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.985927e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.986486e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.986486e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.075585e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.076181e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.076181e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.329709 sec +TOTAL : 1.300193 sec INFO: No Floating Point Exceptions have been reported - 3,529,740,112 cycles # 2.648 GHz - 8,168,252,869 instructions # 2.31 insn per cycle - 1.333651402 seconds time elapsed + 3,527,583,777 cycles # 2.706 GHz + 8,167,337,738 instructions # 2.32 insn per cycle + 1.304488773 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.359545e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.360045e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.360045e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.471286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.471806e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.471806e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.576721 sec +TOTAL : 1.527099 sec INFO: No Floating Point Exceptions have been reported - 2,623,702,370 cycles # 1.660 GHz - 4,153,356,804 instructions # 1.58 insn per cycle - 1.580839869 seconds time elapsed + 2,623,859,326 cycles # 1.714 GHz + 4,153,167,835 instructions # 1.58 insn per cycle + 1.531362339 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 38bc670a18..e2998d6ab4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_14:42:48 +DATE: 2024-06-02_20:59:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.927387e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.315718e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.634653e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.755226e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.274442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.625510e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.452451 sec +TOTAL : 0.455971 sec INFO: No Floating Point Exceptions have been reported - 1,879,085,625 cycles # 2.815 GHz - 2,632,406,951 instructions # 1.40 insn per cycle - 0.724903288 seconds time elapsed + 
1,884,775,023 cycles # 2.810 GHz + 2,642,675,236 instructions # 1.40 insn per cycle + 0.728830794 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.675522e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.208336e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.557322e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.160169e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.139027e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.542035e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.534593 sec +TOTAL : 0.535733 sec INFO: No Floating Point Exceptions have been reported - 2,165,913,457 cycles # 2.812 GHz - 3,139,398,529 instructions # 1.45 insn per cycle - 0.827804422 seconds time elapsed + 2,191,012,800 cycles # 2.827 GHz + 3,140,675,750 instructions # 1.43 insn per cycle + 0.832367820 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.011560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.033153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.033153e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.013225e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.034611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034611e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.642980 sec +TOTAL : 1.640289 sec INFO: No Floating Point Exceptions have been reported - 4,710,402,412 cycles # 2.861 GHz - 13,462,495,012 instructions # 2.86 insn per cycle - 1.647108070 seconds time elapsed + 4,709,353,007 cycles # 2.865 GHz + 13,462,429,209 instructions # 2.86 insn per cycle + 1.644426545 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.839775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910542e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.842535e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.913005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913005e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913158 sec +TOTAL : 0.912012 sec INFO: No Floating Point Exceptions have been reported - 2,620,816,977 cycles # 2.859 
GHz - 7,551,970,333 instructions # 2.88 insn per cycle - 0.917276709 seconds time elapsed + 2,622,620,970 cycles # 2.864 GHz + 7,552,013,729 instructions # 2.88 insn per cycle + 0.916398164 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.970408e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.156692e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.156692e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.080547e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.281001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.281001e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.574933 sec +TOTAL : 0.555084 sec INFO: No Floating Point Exceptions have been reported - 1,480,758,822 cycles # 2.560 GHz - 3,119,703,419 instructions # 2.11 insn per cycle - 0.579132992 seconds time elapsed + 1,478,897,839 cycles # 2.647 GHz + 3,119,129,700 instructions # 2.11 insn per cycle + 0.559398478 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 3.428889e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.682851e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.682851e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.427781e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.674591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.674591e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501020 sec +TOTAL : 0.501239 sec INFO: No Floating Point Exceptions have been reported - 1,347,520,276 cycles # 2.670 GHz - 2,981,434,055 instructions # 2.21 insn per cycle - 0.505363497 seconds time elapsed + 1,340,705,970 cycles # 2.655 GHz + 2,981,253,669 instructions # 2.22 insn per cycle + 0.505572840 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.241546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.347861e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.347861e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.239263e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.756225 sec +TOTAL : 0.757058 sec INFO: No Floating Point Exceptions have been reported - 1,330,320,612 cycles # 1.751 GHz - 1,953,406,018 instructions # 1.47 insn per cycle - 0.760489864 seconds time elapsed + 1,333,648,350 cycles # 1.754 GHz + 1,953,454,025 instructions # 1.46 insn per cycle + 0.761354043 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 6f141963c0..ea21ef5e35 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_15:02:27 +DATE: 2024-06-02_21:28:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.428295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.103056e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.103056e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.419906e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.143705e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.143705e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.480636 sec +TOTAL : 0.485647 sec INFO: No Floating Point Exceptions have been reported - 1,959,891,585 cycles # 2.818 GHz - 2,927,619,706 instructions # 1.49 insn per cycle - 0.752080667 seconds time elapsed + 1,977,356,966 cycles # 2.831 GHz + 2,947,602,370 instructions # 1.49 insn per cycle + 0.756831394 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.157968e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.371122e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.371122e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.235107e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.556413e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.556413e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.764001 sec +TOTAL : 0.757410 sec INFO: No Floating Point Exceptions have been reported - 2,873,640,599 cycles # 2.829 GHz - 4,407,079,803 instructions # 1.53 insn per cycle - 1.073816079 seconds time elapsed + 2,885,831,764 cycles # 2.864 GHz + 4,411,934,467 instructions # 1.53 insn per cycle + 1.066148439 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.008642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.030371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.030371e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.023172e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045212e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.654853 sec +TOTAL : 1.632598 sec INFO: No Floating Point Exceptions have been reported - 4,747,034,662 cycles # 2.862 GHz - 13,469,694,473 instructions # 2.84 insn per cycle - 1.659302078 seconds time elapsed + 4,757,113,003 cycles # 2.909 GHz + 13,469,643,583 instructions # 2.83 insn per cycle + 1.637219700 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.820348e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.892211e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.892211e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.853468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.930951 sec +TOTAL : 0.915572 sec INFO: No Floating Point Exceptions have been reported - 2,665,977,292 cycles # 
2.852 GHz - 7,601,998,240 instructions # 2.85 insn per cycle - 0.935555380 seconds time elapsed + 2,673,568,028 cycles # 2.908 GHz + 7,602,475,789 instructions # 2.84 insn per cycle + 0.920287554 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.068620e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.272960e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.272960e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.103941e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.314249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.314249e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.564348 sec +TOTAL : 0.559050 sec INFO: No Floating Point Exceptions have been reported - 1,513,664,570 cycles # 2.669 GHz - 3,168,463,518 instructions # 2.09 insn per cycle - 0.568761168 seconds time elapsed + 1,524,948,008 cycles # 2.709 GHz + 3,168,482,011 instructions # 2.08 insn per cycle + 0.563565721 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 
4 -EvtsPerSec[Rmb+ME] (23) = ( 3.408389e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.655047e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.655047e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.488503e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.747329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.747329e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.511104 sec +TOTAL : 0.500126 sec INFO: No Floating Point Exceptions have been reported - 1,377,582,779 cycles # 2.675 GHz - 3,030,644,125 instructions # 2.20 insn per cycle - 0.515560343 seconds time elapsed + 1,382,997,758 cycles # 2.744 GHz + 3,030,723,769 instructions # 2.19 insn per cycle + 0.504745068 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.221799e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.329402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.290455e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.403634e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.403634e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.769820 sec +TOTAL : 0.749461 sec INFO: No Floating Point Exceptions have been reported - 1,366,102,927 cycles # 1.765 GHz - 1,991,071,116 instructions # 1.46 insn per cycle - 0.774386560 seconds time elapsed + 1,376,974,835 cycles # 1.828 GHz + 1,993,483,040 instructions # 1.45 insn per cycle + 0.754191421 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 25b8d3c885..e245581a8d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_14:43:01 +DATE: 2024-06-02_20:59:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.907170e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.197971e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.504611e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.726582e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.144906e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.472843e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455771 sec +TOTAL : 0.456749 sec INFO: No Floating Point Exceptions have been reported - 1,881,865,516 cycles # 2.813 GHz - 2,669,782,801 instructions # 1.42 insn per cycle - 0.727786761 seconds time elapsed + 
1,887,155,378 cycles # 2.815 GHz + 2,674,611,026 instructions # 1.42 insn per cycle + 0.729330088 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.641992e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.081273e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.416654e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.182415e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.046068e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.438938e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.532301 sec +TOTAL : 0.537611 sec INFO: No Floating Point Exceptions have been reported - 2,167,822,822 cycles # 2.823 GHz - 3,120,353,321 instructions # 1.44 insn per cycle - 0.825343283 seconds time elapsed + 2,190,802,810 cycles # 2.823 GHz + 3,133,468,280 instructions # 1.43 insn per cycle + 0.833932090 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.007784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029112e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.029112e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.009988e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.031220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.031220e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.648995 sec +TOTAL : 1.645685 sec INFO: No Floating Point Exceptions have been reported - 4,725,323,359 cycles # 2.860 GHz - 13,457,369,308 instructions # 2.85 insn per cycle - 1.653142214 seconds time elapsed + 4,722,807,018 cycles # 2.864 GHz + 13,456,640,489 instructions # 2.85 insn per cycle + 1.649922962 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.833913e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904030e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883785e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883785e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915889 sec +TOTAL : 0.926401 sec INFO: No Floating Point Exceptions have been reported - 2,628,184,982 cycles # 2.858 
GHz - 7,551,273,836 instructions # 2.87 insn per cycle - 0.920086997 seconds time elapsed + 2,657,840,392 cycles # 2.859 GHz + 7,551,476,794 instructions # 2.84 insn per cycle + 0.930660111 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.116183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320457e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.099699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.310121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.548674 sec +TOTAL : 0.551668 sec INFO: No Floating Point Exceptions have been reported - 1,476,841,675 cycles # 2.675 GHz - 3,117,924,257 instructions # 2.11 insn per cycle - 0.552738607 seconds time elapsed + 1,479,744,333 cycles # 2.665 GHz + 3,118,004,055 instructions # 2.11 insn per cycle + 0.555928182 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 3.456247e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.706124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.706124e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.438094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687392e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.496887 sec +TOTAL : 0.499527 sec INFO: No Floating Point Exceptions have been reported - 1,340,057,166 cycles # 2.677 GHz - 2,978,732,248 instructions # 2.22 insn per cycle - 0.501058940 seconds time elapsed + 1,342,931,484 cycles # 2.669 GHz + 2,978,966,446 instructions # 2.22 insn per cycle + 0.503782569 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.241283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.347840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.347840e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.249330e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.355241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.355241e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.757118 sec +TOTAL : 0.753724 sec INFO: No Floating Point Exceptions have been reported - 1,329,966,748 cycles # 1.749 GHz - 1,951,787,640 instructions # 1.47 insn per cycle - 0.761356492 seconds time elapsed + 1,329,841,706 cycles # 1.756 GHz + 1,951,471,549 instructions # 1.47 insn per cycle + 0.758013007 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 88eaa7d80d..3a86532d9d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_14:43:15 +DATE: 2024-06-02_20:59:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.867335e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.223690e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.343650e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.556743e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.220581e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.348241e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.450195 sec +TOTAL : 0.446170 sec INFO: No Floating Point Exceptions have been reported - 1,886,543,936 cycles # 2.814 GHz - 2,627,629,254 instructions # 1.39 insn per cycle - 0.729554150 seconds time elapsed + 1,860,217,734 
cycles # 2.814 GHz + 2,643,257,485 instructions # 1.42 insn per cycle + 0.717600829 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.183442e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.842494e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.962990e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.900642e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805238e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969673e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.482910 sec +TOTAL : 0.486879 sec INFO: No Floating Point Exceptions have been reported - 1,994,696,147 cycles # 2.812 GHz - 2,828,466,882 instructions # 1.42 insn per cycle - 0.766894337 seconds time elapsed + 2,017,900,407 cycles # 2.822 GHz + 2,883,772,994 instructions # 1.43 insn per cycle + 0.772554185 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093791e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.071641e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095909e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.553490 sec +TOTAL : 1.550578 sec INFO: No Floating Point Exceptions have been reported - 4,455,366,971 cycles # 2.862 GHz - 13,047,769,817 instructions # 2.93 insn per cycle - 1.557630020 seconds time elapsed + 4,458,106,622 cycles # 2.869 GHz + 13,047,664,900 instructions # 2.93 insn per cycle + 1.554777575 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.869084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.052765e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.052765e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.867972e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.052825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.052825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.592151 sec +TOTAL : 0.592805 sec INFO: No Floating Point Exceptions have been reported - 1,701,146,602 cycles # 2.856 
GHz - 4,512,165,265 instructions # 2.65 insn per cycle - 0.596248693 seconds time elapsed + 1,702,248,935 cycles # 2.855 GHz + 4,512,704,282 instructions # 2.65 insn per cycle + 0.597040153 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.609679e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.315056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.315056e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.475375e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.145092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.145092e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.313413 sec +TOTAL : 0.320934 sec INFO: No Floating Point Exceptions have been reported - 850,737,642 cycles # 2.684 GHz - 1,895,945,890 instructions # 2.23 insn per cycle - 0.317546154 seconds time elapsed + 853,448,782 cycles # 2.630 GHz + 1,896,008,778 instructions # 2.22 insn per cycle + 0.325158867 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 5.973396e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.785303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.785303e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.798508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.559778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.559778e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.295859 sec +TOTAL : 0.304197 sec INFO: No Floating Point Exceptions have been reported - 801,819,935 cycles # 2.679 GHz - 1,819,229,849 instructions # 2.27 insn per cycle - 0.299944027 seconds time elapsed + 803,501,124 cycles # 2.610 GHz + 1,818,839,783 instructions # 2.26 insn per cycle + 0.308391634 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.354956e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.770974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.770974e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.329166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.749966e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.749966e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.400076 sec +TOTAL : 0.402147 sec INFO: No Floating Point Exceptions have been reported - 733,009,701 cycles # 1.817 GHz - 1,304,250,799 instructions # 1.78 insn per cycle - 0.404216975 seconds time elapsed + 735,010,684 cycles # 1.812 GHz + 1,304,684,504 instructions # 1.78 insn per cycle + 0.406336425 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index b62a8a0309..94d91d36db 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_15:02:40 +DATE: 2024-06-02_21:29:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.337579e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.030007e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.030007e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.395370e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.197522e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.197522e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.462122 sec +TOTAL : 0.460501 sec INFO: No Floating Point Exceptions have been reported - 1,901,719,201 cycles # 2.816 GHz - 2,811,032,752 instructions # 1.48 insn per cycle - 0.731978994 seconds time elapsed + 1,922,773,738 cycles # 2.852 GHz + 2,838,003,502 instructions # 1.48 insn per cycle + 0.731092352 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.907303e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.566216e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.566216e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.025472e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.955203e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.955203e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.637098 sec +TOTAL : 0.631520 sec INFO: No Floating Point Exceptions have been reported - 2,459,040,544 cycles # 2.824 GHz - 3,715,271,980 instructions # 1.51 insn per cycle - 0.927773682 seconds time elapsed + 2,501,267,965 cycles # 2.864 GHz + 3,790,017,450 instructions # 1.52 insn per cycle + 0.930310379 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.068152e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092782e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.083915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108811e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.559403 sec +TOTAL : 1.537914 sec INFO: No Floating Point Exceptions have been reported - 4,475,912,555 cycles # 2.864 GHz - 13,052,235,712 instructions # 2.92 insn per cycle - 1.563691095 seconds time elapsed + 4,486,361,889 cycles # 2.911 GHz + 13,052,814,653 instructions # 2.91 insn per cycle + 1.542180616 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -138,15 +138,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.856394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.039884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.039884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.870842e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.059379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.059379e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.599599 sec +TOTAL : 0.597066 sec INFO: No Floating Point Exceptions have been reported - 1,723,185,860 cycles # 2.856 
GHz - 4,560,285,596 instructions # 2.65 insn per cycle - 0.603925442 seconds time elapsed + 1,730,283,965 cycles # 2.880 GHz + 4,559,978,438 instructions # 2.64 insn per cycle + 0.601626043 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -167,15 +167,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.545801e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.241062e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.241062e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.495825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.177328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.177328e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.321141 sec +TOTAL : 0.324345 sec INFO: No Floating Point Exceptions have been reported - 871,513,310 cycles # 2.683 GHz - 1,932,959,243 instructions # 2.22 insn per cycle - 0.325378385 seconds time elapsed + 873,246,909 cycles # 2.662 GHz + 1,932,851,891 instructions # 2.21 insn per cycle + 0.328693833 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -196,15 +196,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 5.891127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.696072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.696072e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.853914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.645046e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.645046e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.304221 sec +TOTAL : 0.305954 sec INFO: No Floating Point Exceptions have been reported - 825,995,486 cycles # 2.683 GHz - 1,856,161,781 instructions # 2.25 insn per cycle - 0.308416114 seconds time elapsed + 825,418,764 cycles # 2.665 GHz + 1,855,748,763 instructions # 2.25 insn per cycle + 0.310269538 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -225,15 +225,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.307702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.720545e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.720545e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.313371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.736001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.736001e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.408443 sec +TOTAL : 0.408387 sec INFO: No Floating Point Exceptions have been reported - 755,445,387 cycles # 1.833 GHz - 1,345,989,570 instructions # 1.78 insn per cycle - 0.412779323 seconds time elapsed + 758,493,445 cycles # 1.840 GHz + 1,345,737,027 instructions # 1.77 insn per cycle + 0.412742502 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index f782cd39a5..05c0e197eb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_14:43:27 +DATE: 2024-06-02_21:00:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.882997e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.225822e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.344729e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.474693e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189633e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316136e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.447687 sec +TOTAL : 0.447182 sec INFO: No Floating Point Exceptions have been reported - 1,891,564,072 cycles # 2.816 GHz - 2,660,739,786 instructions # 1.41 insn per cycle - 0.729746219 seconds time elapsed + 1,893,556,574 
cycles # 2.816 GHz + 2,664,714,918 instructions # 1.41 insn per cycle + 0.729010855 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.107850e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.805980e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921999e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.711155e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.788726e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932305e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.485790 sec +TOTAL : 0.487543 sec INFO: No Floating Point Exceptions have been reported - 1,996,906,378 cycles # 2.807 GHz - 2,867,667,096 instructions # 1.44 insn per cycle - 0.769333150 seconds time elapsed + 2,013,294,748 cycles # 2.813 GHz + 2,871,604,214 instructions # 1.43 insn per cycle + 0.773944888 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.094168e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.094168e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.069069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093696e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093696e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.553096 sec +TOTAL : 1.554121 sec INFO: No Floating Point Exceptions have been reported - 4,454,505,799 cycles # 2.862 GHz - 13,029,391,838 instructions # 2.92 insn per cycle - 1.557292510 seconds time elapsed + 4,457,740,413 cycles # 2.864 GHz + 13,029,198,665 instructions # 2.92 insn per cycle + 1.558378600 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.876347e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.060596e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.060596e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.886512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070924e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.070924e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.590255 sec +TOTAL : 0.588580 sec INFO: No Floating Point Exceptions have been reported - 1,693,495,983 cycles # 2.852 
GHz - 4,508,141,451 instructions # 2.66 insn per cycle - 0.594398488 seconds time elapsed + 1,693,887,931 cycles # 2.861 GHz + 4,507,886,410 instructions # 2.66 insn per cycle + 0.592760485 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.574680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.273652e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.273652e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.575079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.270984e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.270984e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.315080 sec +TOTAL : 0.315242 sec INFO: No Floating Point Exceptions have been reported - 851,359,645 cycles # 2.672 GHz - 1,893,112,803 instructions # 2.22 insn per cycle - 0.319204462 seconds time elapsed + 850,868,776 cycles # 2.668 GHz + 1,892,927,301 instructions # 2.22 insn per cycle + 0.319517780 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 5.978403e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.785893e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.785893e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.005606e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.810836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.810836e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.295093 sec +TOTAL : 0.293957 sec INFO: No Floating Point Exceptions have been reported - 799,712,323 cycles # 2.678 GHz - 1,814,979,638 instructions # 2.27 insn per cycle - 0.299228201 seconds time elapsed + 798,554,126 cycles # 2.684 GHz + 1,814,787,943 instructions # 2.27 insn per cycle + 0.298209331 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.317992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.737735e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.737735e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.328007e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.744710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.744710e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.402763 sec +TOTAL : 0.402038 sec INFO: No Floating Point Exceptions have been reported - 736,511,578 cycles # 1.812 GHz - 1,302,115,541 instructions # 1.77 insn per cycle - 0.406867415 seconds time elapsed + 736,423,468 cycles # 1.816 GHz + 1,301,837,346 instructions # 1.77 insn per cycle + 0.406202485 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1936) (512y: 32) (512z: 2382) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 265a4a7626..0c9965805b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_14:43:39 +DATE: 2024-06-02_21:00:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.940149e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.336219e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.662963e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.769571e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.350019e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.722049e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.452840 sec +TOTAL : 0.450882 sec INFO: No Floating Point Exceptions have been reported - 1,880,363,198 cycles # 2.808 GHz - 2,677,692,820 instructions # 1.42 insn per cycle - 0.726161506 seconds time elapsed + 1,880,734,206 
cycles # 2.821 GHz + 2,660,717,871 instructions # 1.41 insn per cycle + 0.723822735 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.684159e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.236315e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.588311e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.238115e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.167791e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.574879e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.530728 sec +TOTAL : 0.536771 sec INFO: No Floating Point Exceptions have been reported - 2,164,642,485 cycles # 2.821 GHz - 3,145,530,012 instructions # 1.45 insn per cycle - 0.824333778 seconds time elapsed + 2,185,532,891 cycles # 2.814 GHz + 3,100,266,437 instructions # 1.42 insn per cycle + 0.833787821 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.003476e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.024445e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.024445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.000819e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.021896e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021896e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.655769 sec +TOTAL : 1.660472 sec INFO: No Floating Point Exceptions have been reported - 4,745,491,139 cycles # 2.860 GHz - 13,466,039,366 instructions # 2.84 insn per cycle - 1.659848552 seconds time elapsed + 4,752,171,523 cycles # 2.856 GHz + 13,466,883,992 instructions # 2.83 insn per cycle + 1.664681738 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.920343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.920343e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.835752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.905918e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905918e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.908133 sec +TOTAL : 0.915273 sec INFO: No Floating Point Exceptions have been reported - 2,605,721,632 cycles # 2.858 
GHz - 7,384,650,569 instructions # 2.83 insn per cycle - 0.912227813 seconds time elapsed + 2,607,019,310 cycles # 2.837 GHz + 7,384,430,613 instructions # 2.83 insn per cycle + 0.919590344 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.133010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.340359e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.340359e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.096035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.299954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.546033 sec +TOTAL : 0.552392 sec INFO: No Floating Point Exceptions have been reported - 1,469,888,298 cycles # 2.674 GHz - 3,055,461,884 instructions # 2.08 insn per cycle - 0.550169150 seconds time elapsed + 1,470,210,795 cycles # 2.644 GHz + 3,054,979,092 instructions # 2.08 insn per cycle + 0.556662961 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 3.544324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807645e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807645e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.519152e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.779200e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.779200e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.485213 sec +TOTAL : 0.488880 sec INFO: No Floating Point Exceptions have been reported - 1,307,959,720 cycles # 2.676 GHz - 2,930,377,532 instructions # 2.24 insn per cycle - 0.489382978 seconds time elapsed + 1,309,516,900 cycles # 2.658 GHz + 2,929,953,488 instructions # 2.24 insn per cycle + 0.493142435 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.172350e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.272043e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.272043e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.182113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.279394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.279394e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.779592 sec +TOTAL : 0.776228 sec INFO: No Floating Point Exceptions have been reported - 1,368,592,699 cycles # 1.747 GHz - 1,969,378,714 instructions # 1.44 insn per cycle - 0.783958712 seconds time elapsed + 1,367,728,530 cycles # 1.754 GHz + 1,969,246,999 instructions # 1.44 insn per cycle + 0.780474856 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 84e80111cc..9ad9b977c8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-16_14:43:53 +DATE: 2024-06-02_21:00:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.890956e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.181054e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.513059e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.725668e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.143482e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.487281e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.453973 sec +TOTAL : 0.455675 sec INFO: No Floating Point Exceptions have been reported - 1,876,167,670 cycles # 2.808 GHz - 2,662,885,558 instructions # 1.42 insn per cycle - 0.726739496 seconds time elapsed + 1,885,065,360 
cycles # 2.812 GHz + 2,651,104,600 instructions # 1.41 insn per cycle + 0.728600735 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.642147e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.081360e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.416296e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.168974e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.005384e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.392584e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.533908 sec +TOTAL : 0.537275 sec INFO: No Floating Point Exceptions have been reported - 2,163,893,097 cycles # 2.818 GHz - 3,132,561,280 instructions # 1.45 insn per cycle - 0.826852700 seconds time elapsed + 2,188,989,186 cycles # 2.824 GHz + 3,146,111,555 instructions # 1.44 insn per cycle + 0.833354304 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.007176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.028375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.028375e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.007306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.028735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028735e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.650928 sec +TOTAL : 1.649832 sec INFO: No Floating Point Exceptions have been reported - 4,733,031,285 cycles # 2.861 GHz - 13,451,191,160 instructions # 2.84 insn per cycle - 1.655053441 seconds time elapsed + 4,736,304,640 cycles # 2.865 GHz + 13,451,261,336 instructions # 2.84 insn per cycle + 1.654076225 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.847760e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.919370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.919370e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.856269e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927884e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.909156 sec +TOTAL : 0.905701 sec INFO: No Floating Point Exceptions have been reported - 2,606,818,939 cycles # 2.857 
GHz - 7,388,977,556 instructions # 2.83 insn per cycle - 0.913243210 seconds time elapsed + 2,609,412,517 cycles # 2.870 GHz + 7,388,220,177 instructions # 2.83 insn per cycle + 0.909956215 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.915489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.093943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.093943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.132823e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.336633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.336633e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.585235 sec +TOTAL : 0.545981 sec INFO: No Floating Point Exceptions have been reported - 1,469,957,671 cycles # 2.496 GHz - 3,055,084,256 instructions # 2.08 insn per cycle - 0.589443028 seconds time elapsed + 1,469,511,109 cycles # 2.674 GHz + 3,055,566,040 instructions # 2.08 insn per cycle + 0.550174078 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 3.535422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.797003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.536658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.797118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.486063 sec +TOTAL : 0.486288 sec INFO: No Floating Point Exceptions have been reported - 1,306,700,125 cycles # 2.669 GHz - 2,930,583,524 instructions # 2.24 insn per cycle - 0.490171496 seconds time elapsed + 1,307,118,138 cycles # 2.675 GHz + 2,931,084,096 instructions # 2.24 insn per cycle + 0.490604178 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.173668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.273111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.273111e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.194909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.778991 sec +TOTAL : 0.771280 sec INFO: No Floating Point Exceptions have been reported - 1,367,910,665 cycles # 1.749 GHz - 1,969,371,455 instructions # 1.44 insn per cycle - 0.783143035 seconds time elapsed + 1,369,689,187 cycles # 1.768 GHz + 1,969,498,668 instructions # 1.44 insn per cycle + 0.775554324 seconds time elapsed =Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 8af6873425..529929a5c3 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-16_15:20:33 +DATE: 2024-06-02_22:05:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.588343e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081541e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.176224e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.419887e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089577e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185165e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.528808 sec +TOTAL : 0.533774 sec INFO: No Floating Point Exceptions have been reported - 2,192,111,166 cycles # 2.821 GHz - 3,135,008,318 instructions # 1.43 insn per cycle - 0.833908791 seconds time 
elapsed + 2,165,329,100 cycles # 2.817 GHz + 3,116,509,991 instructions # 1.44 insn per cycle + 0.826546475 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.915227e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.915227e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.865718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.915718e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.915718e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 5.734356 sec +TOTAL : 5.734321 sec INFO: No Floating Point Exceptions have been reported - 16,430,057,220 cycles # 2.863 GHz - 42,484,854,801 instructions # 2.59 insn per cycle - 5.739849036 seconds time elapsed + 16,442,910,174 cycles # 2.865 GHz + 42,483,732,959 instructions # 2.58 insn per cycle + 5.739954746 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 711) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.235376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) 
= ( 3.401567e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.401567e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.238803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404775e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.353450 sec +TOTAL : 3.350989 sec INFO: No Floating Point Exceptions have been reported - 9,612,345,009 cycles # 2.863 GHz - 26,317,248,003 instructions # 2.74 insn per cycle - 3.358813940 seconds time elapsed + 9,605,090,400 cycles # 2.862 GHz + 26,316,930,760 instructions # 2.74 insn per cycle + 3.356479084 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2388) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.244474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.678972e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.678972e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.211332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.644021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.644021e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.111219 sec +TOTAL : 2.125658 sec INFO: No Floating Point Exceptions have been reported - 5,673,148,574 cycles # 2.682 GHz - 12,029,125,150 instructions # 2.12 insn per cycle - 2.116589548 seconds time elapsed + 5,695,554,831 cycles # 2.674 GHz + 12,026,163,349 instructions # 2.11 insn per cycle + 2.131034660 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2532) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.759844e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.282682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.282682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.667087e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.177658e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.177658e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 1.934603 sec +TOTAL : 1.963849 sec INFO: No Floating Point Exceptions have been reported - 5,185,525,755 cycles # 2.675 GHz - 11,158,849,555 instructions # 2.15 insn per cycle - 1.940086470 seconds time elapsed + 5,196,538,426 cycles # 2.640 GHz + 11,156,532,822 instructions # 2.15 insn per cycle + 1.969477022 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2195) (512y: 148) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.492671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676216e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.676216e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.473166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.654961e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.654961e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.114794 sec +TOTAL : 3.132981 sec INFO: No Floating Point Exceptions have been reported - 5,530,850,143 cycles # 1.773 GHz - 8,071,834,418 instructions # 1.46 insn per cycle - 3.120392658 seconds time elapsed + 5,562,641,747 cycles # 1.773 GHz + 8,071,126,847 instructions # 1.45 insn per cycle + 3.138593943 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1471) (512y: 129) (512z: 1684) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index 746b04ecac..50bff49e4f 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-16_15:20:58 +DATE: 2024-06-02_22:06:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.594523e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092654e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.188255e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.425736e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093257e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.189942e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.533303 sec +TOTAL : 0.532512 sec INFO: No Floating Point Exceptions have been reported - 2,159,610,833 cycles # 2.816 GHz - 3,095,961,302 instructions # 1.43 insn per cycle - 0.825364511 seconds time elapsed + 2,171,060,205 cycles # 2.823 GHz + 3,109,256,727 instructions # 1.43 insn per cycle + 0.825854836 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.884407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.935333e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935333e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.886140e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.937260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937260e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 5.676327 sec +TOTAL : 5.672763 sec INFO: No Floating Point Exceptions have been reported - 16,262,813,557 cycles # 2.863 GHz - 43,266,807,177 instructions # 2.66 insn per cycle - 5.681729392 seconds time elapsed + 16,265,601,075 cycles # 2.865 GHz + 43,265,334,700 instructions # 2.66 insn per cycle + 5.678175360 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 662) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.290556e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.463505e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.463505e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.297388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.470079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.470079e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.298765 sec +TOTAL : 3.294375 sec INFO: No Floating Point Exceptions have been reported - 9,454,937,516 cycles # 2.862 GHz - 25,430,832,847 instructions # 2.69 insn per cycle - 3.304226277 seconds time elapsed + 9,446,450,203 cycles # 2.864 GHz + 25,429,379,126 instructions # 2.69 insn per cycle + 3.299817979 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2268) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.695348e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.042916e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.042916e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.653725e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.997172e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.997172e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.345615 sec +TOTAL : 2.366874 sec INFO: No Floating Point Exceptions have been reported - 6,296,882,273 cycles # 2.679 GHz - 13,638,682,807 instructions # 2.17 insn per cycle - 2.351107442 seconds time elapsed + 6,282,545,209 cycles # 2.649 GHz + 13,637,137,621 instructions # 2.17 insn per cycle + 2.372362603 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2629) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.910957e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.286382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.286382e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.883827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.255659e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.255659e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.246822 sec +TOTAL : 2.260406 sec INFO: No Floating Point Exceptions have been reported - 6,026,491,701 cycles # 2.677 GHz - 12,722,860,113 instructions # 2.11 insn per cycle - 2.252413644 seconds time elapsed + 6,053,972,710 cycles # 2.673 GHz + 12,722,135,295 instructions # 2.10 insn per cycle + 2.265888998 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 296) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.420299e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.596534e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.596534e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.425130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.601710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601710e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.177504 sec +TOTAL : 3.176107 sec INFO: No Floating Point Exceptions have been reported - 5,627,100,070 cycles # 1.769 GHz - 8,928,441,764 instructions # 1.59 insn per cycle - 3.183062200 seconds time elapsed + 5,633,600,912 cycles # 1.772 GHz + 8,927,465,538 instructions # 1.58 insn per cycle + 3.181704052 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1357) (512y: 171) (512z: 1777) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index a9079e9716..2f0a202d23 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-16_15:21:23 +DATE: 2024-06-02_22:06:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.566221e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.504693e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.775023e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.564465e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.483919e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773757e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.485925 sec +TOTAL : 0.486107 sec INFO: No Floating Point Exceptions have been reported - 2,003,287,538 cycles # 2.816 GHz - 2,880,414,118 instructions # 1.44 insn per cycle - 0.769648039 seconds time elapsed + 2,007,999,810 cycles # 2.821 GHz + 2,899,840,704 instructions # 1.44 insn per cycle + 0.768580858 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.938364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.994818e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.994818e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944128e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.000566e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.000566e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 5.499884 sec +TOTAL : 5.484463 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 15,743,516,639 cycles # 2.861 GHz - 42,225,863,593 instructions # 2.68 insn per cycle - 5.505101290 seconds time elapsed + 15,737,215,094 cycles # 2.868 GHz + 42,223,129,627 instructions # 2.68 insn per cycle + 5.489744872 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.494085e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.834702e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.834702e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
4.521211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.866032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.866032e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.423560 sec +TOTAL : 2.410232 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,948,197,620 cycles # 2.861 GHz - 16,919,710,710 instructions # 2.44 insn per cycle - 2.428887408 seconds time elapsed + 6,949,798,990 cycles # 2.877 GHz + 16,918,935,545 instructions # 2.43 insn per cycle + 2.415585542 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2983) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -141,15 +141,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.820914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.816967e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.816967e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.866213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.881041e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.881041e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.429543 sec +TOTAL : 1.422293 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,855,960,900 cycles # 2.689 GHz - 7,989,689,028 instructions # 2.07 insn per cycle - 1.434693752 seconds time elapsed + 3,860,334,790 cycles # 2.706 GHz + 7,989,354,890 instructions # 2.07 insn per cycle + 1.427639931 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -171,15 +171,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.282128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.407558e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.407558e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.312467e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.446145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.446145e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.355217 sec +TOTAL : 1.350232 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,662,603,190 cycles # 2.693 GHz - 7,491,885,625 instructions # 2.05 insn per cycle - 1.360533114 seconds time elapsed + 3,671,588,520 cycles # 2.710 GHz + 7,492,175,118 instructions # 2.04 insn per cycle + 1.355454952 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3036) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -201,15 +201,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.072932e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.653576e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.653576e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.291103e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.907078e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.907078e+05 ) sec^-1 MeanMatrixElemValue 
= ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.816585 sec +TOTAL : 1.755277 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,322,287,385 cycles # 1.825 GHz - 5,988,754,595 instructions # 1.80 insn per cycle - 1.821834164 seconds time elapsed + 3,329,330,756 cycles # 1.892 GHz + 5,989,173,339 instructions # 1.80 insn per cycle + 1.760500099 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2418) (512y: 32) (512z: 2031) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 0359df7b77..947a9772a4 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-16_15:21:44 +DATE: 2024-06-02_22:06:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.575897e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.505600e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.778243e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.661514e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.491247e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780216e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.488373 sec +TOTAL : 0.485941 sec INFO: No Floating Point Exceptions have been reported - 2,007,752,645 cycles # 2.812 GHz - 2,828,437,251 instructions # 1.41 insn per cycle - 0.772837040 seconds time elapsed + 2,010,449,048 cycles # 2.824 GHz + 2,892,288,272 instructions # 1.44 insn per cycle + 0.769427686 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.991117e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.050649e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.050649e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.994628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.054364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.054364e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 5.356246 sec +TOTAL : 5.347389 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 15,339,535,429 cycles # 2.862 GHz - 42,474,905,629 instructions # 2.77 insn per cycle - 5.361339903 seconds time elapsed + 15,338,140,112 cycles # 2.867 GHz + 42,471,920,214 instructions # 2.77 insn per cycle + 5.352646805 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 559) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.134209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.583662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.583662e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.117220e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.566866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.566866e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.132369 sec +TOTAL : 2.140174 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,119,263,046 cycles # 2.864 GHz - 16,261,701,502 instructions # 2.66 insn per cycle - 2.137647028 seconds time elapsed + 6,135,855,566 cycles # 2.861 GHz + 16,262,350,066 instructions # 2.65 insn per cycle + 2.145522102 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2702) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -141,15 +141,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.498649e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.173623e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173623e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.475476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.144559e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.144559e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.703269 sec +TOTAL : 1.709024 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,581,699,390 cycles # 2.683 GHz - 9,041,394,873 instructions # 1.97 insn per cycle - 1.708700782 seconds time elapsed + 4,596,639,303 cycles # 2.683 GHz + 9,041,859,622 instructions # 1.97 insn per cycle + 1.714357652 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3558) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -171,15 +171,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.705142e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.424759e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.424759e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.643193e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
7.350005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.350005e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.652652 sec +TOTAL : 1.667959 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,411,023,052 cycles # 2.662 GHz - 8,532,140,610 instructions # 1.93 insn per cycle - 1.658018216 seconds time elapsed + 4,476,334,932 cycles # 2.676 GHz + 8,532,641,638 instructions # 1.91 insn per cycle + 1.673325231 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3311) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -201,15 +201,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.118773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.709641e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.709641e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.116930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.706081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.706081e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.803301 sec +TOTAL : 1.804265 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,302,699,675 cycles # 1.827 GHz - 5,958,419,273 instructions # 1.80 insn per cycle - 1.808538430 seconds time elapsed + 3,310,050,790 cycles # 1.830 GHz + 5,957,409,151 instructions # 1.80 insn per cycle + 1.809617116 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2357) (512y: 32) (512z: 2014) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 4345b3c851..4f4847b6b6 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-16_15:22:04 +DATE: 2024-06-02_22:07:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.596790e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.087710e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182609e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.488531e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088944e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184171e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.531791 sec +TOTAL : 0.530669 sec INFO: No Floating Point Exceptions have been reported - 2,158,479,665 cycles # 2.816 GHz - 3,115,947,911 instructions # 1.44 insn per cycle - 0.824595914 seconds time elapsed + 2,163,522,979 cycles # 2.824 GHz + 3,135,447,150 instructions # 1.45 insn per cycle + 0.823757872 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.739846e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.783362e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.783362e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.700696e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.742660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.742660e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.139062 sec +TOTAL : 6.280235 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 17,579,172,412 cycles # 2.862 GHz - 41,767,715,738 instructions # 2.38 insn per cycle - 6.144566394 seconds time elapsed + 17,574,906,678 cycles # 2.797 GHz + 41,764,388,623 instructions # 2.38 insn per cycle + 6.285714574 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.944235e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080846e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080846e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
2.952464e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.091303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.091303e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.674671 sec +TOTAL : 3.665315 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 10,157,870,701 cycles # 2.761 GHz - 26,355,211,403 instructions # 2.59 insn per cycle - 3.680088821 seconds time elapsed + 10,271,038,566 cycles # 2.799 GHz + 26,354,751,502 instructions # 2.57 insn per cycle + 3.670825968 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2438) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -141,15 +141,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.512494e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.830362e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.830362e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.531327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.854267e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.854267e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.435332 sec +TOTAL : 2.428471 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,512,604,303 cycles # 2.669 GHz - 12,120,159,732 instructions # 1.86 insn per cycle - 2.440902409 seconds time elapsed + 6,507,055,363 cycles # 2.675 GHz + 12,119,284,734 instructions # 1.86 insn per cycle + 2.434107519 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2718) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -171,15 +171,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.920988e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.300442e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.300442e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.902513e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.278233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.278233e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.244169 sec +TOTAL : 2.252469 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,018,583,564 cycles # 2.676 GHz - 11,228,279,694 instructions # 1.87 insn per cycle - 2.249711111 seconds time elapsed + 6,021,388,550 cycles # 2.667 GHz + 11,226,998,655 instructions # 1.86 insn per cycle + 2.257985842 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2369) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,15 +201,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.148571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.297302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.297302e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.151456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.300586e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.300586e+05 ) sec^-1 
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.442171 sec +TOTAL : 3.440903 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,072,730,798 cycles # 1.762 GHz - 8,215,005,190 instructions # 1.35 insn per cycle - 3.447734816 seconds time elapsed + 6,082,239,634 cycles # 1.765 GHz + 8,214,081,257 instructions # 1.35 insn per cycle + 3.446457125 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1787) (512y: 134) (512z: 1755) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index fc67fec042..a2ade5f790 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-16_15:22:31 +DATE: 2024-06-02_22:07:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.615689e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096145e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.193163e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.485563e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095251e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.191531e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.527662 sec +TOTAL : 0.530779 sec INFO: No Floating Point Exceptions have been reported - 2,187,091,067 cycles # 2.822 GHz - 3,143,599,790 instructions # 1.44 insn per cycle - 0.831715891 seconds time elapsed + 2,189,344,982 cycles # 2.825 GHz + 3,162,111,956 instructions # 1.44 insn per cycle + 0.831473815 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.750132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.794255e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.794255e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.752561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.796583e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.103500 sec +TOTAL : 6.097431 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 17,473,867,626 cycles # 2.861 GHz - 43,052,630,037 instructions # 2.46 insn per cycle - 6.108967949 seconds time elapsed + 17,482,218,633 cycles # 2.865 GHz + 43,049,154,317 instructions # 2.46 insn per cycle + 6.102997010 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -111,15 +111,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.176372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.336517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.172478e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332063e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.414423 sec +TOTAL : 3.418913 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 9,783,940,024 cycles # 2.862 GHz - 25,167,910,576 instructions # 2.57 insn per cycle - 3.420037518 seconds time elapsed + 9,801,825,304 cycles # 2.863 GHz + 25,166,361,997 instructions # 2.57 insn per cycle + 3.424631390 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2276) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -141,15 +141,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.178030e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.451835e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.451835e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.163570e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.437386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.437386e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.622185 sec +TOTAL : 2.632138 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,019,924,583 cycles # 2.672 GHz - 12,790,606,448 instructions # 1.82 insn per cycle - 2.627804246 seconds time elapsed + 7,035,871,657 cycles # 2.669 GHz + 12,789,981,390 instructions # 1.82 insn per cycle + 2.637561799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2699) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -171,15 +171,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.488078e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.801083e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.801083e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.487333e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.798535e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.798535e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.447720 sec +TOTAL : 2.449076 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,546,937,322 cycles # 2.670 GHz - 12,109,881,739 instructions # 1.85 insn per cycle - 2.453162643 seconds time elapsed + 6,545,009,203 cycles # 2.667 GHz + 12,105,117,349 instructions # 1.85 insn per cycle + 2.454490824 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2351) (512y: 227) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,15 +201,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.983756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.117708e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.117708e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.973614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.106585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.106585e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.627336 sec +TOTAL : 3.639191 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,377,996,877 cycles # 1.756 GHz - 8,984,744,450 instructions # 1.41 insn per cycle - 3.632964633 seconds time elapsed + 6,417,633,310 cycles # 1.761 GHz + 8,985,370,621 instructions # 1.40 insn per cycle + 3.644757809 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1892) (512y: 178) (512z: 2083) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index f2a95b68c4..3909c2de90 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-16_15:19:32 +DATE: 2024-06-02_22:04:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.205899e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.229515e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.233614e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.208514e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.234770e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.239030e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.467479 sec +TOTAL : 0.465857 sec INFO: No Floating Point Exceptions have been reported - 1,929,394,895 cycles # 2.809 GHz - 2,774,653,842 instructions # 1.44 insn per cycle - 0.745241861 seconds time elapsed + 1,954,339,048 cycles # 2.816 GHz + 2,841,818,682 instructions # 1.45 insn per 
cycle + 0.751002948 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.854750e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.994181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.003911e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.793700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.949156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.959741e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485365 sec +TOTAL : 0.483528 sec INFO: No Floating Point Exceptions have been reported - 1,990,830,698 cycles # 2.816 GHz - 2,942,277,354 instructions # 1.48 insn per cycle - 0.765598417 seconds time elapsed + 1,990,538,703 cycles # 2.818 GHz + 2,859,145,317 instructions # 1.44 insn per cycle + 0.763660982 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.339413e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.342602e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.342602e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.333630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.336951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.336951e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.163415 sec +TOTAL : 0.163551 sec INFO: No Floating Point Exceptions have been reported - 474,956,853 cycles # 2.847 GHz - 1,396,923,375 instructions # 2.94 insn per cycle - 0.167372542 seconds time elapsed + 475,454,071 cycles # 2.848 GHz + 1,396,942,135 instructions # 2.94 insn per cycle + 0.167517734 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3991) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.350685e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.362490e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.362490e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.379547e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.391026e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.391026e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.088282 sec +TOTAL : 0.087740 sec INFO: No Floating Point Exceptions have been reported - 246,129,842 cycles # 2.680 
GHz - 699,160,574 instructions # 2.84 insn per cycle - 0.092454839 seconds time elapsed + 245,466,319 cycles # 2.692 GHz + 699,170,520 instructions # 2.85 insn per cycle + 0.091863831 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9501) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.421076e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.426847e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.426847e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.397405e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.403101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.403101e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042262 sec +TOTAL : 0.042780 sec INFO: No Floating Point Exceptions have been reported - 120,513,094 cycles # 2.641 GHz - 260,079,134 instructions # 2.16 insn per cycle - 0.046206481 seconds time elapsed + 121,204,590 cycles # 2.623 GHz + 260,141,578 instructions # 2.15 insn per cycle + 0.046815365 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8227) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.614262e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.622122e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.622122e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.586602e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.593859e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.593859e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037855 sec +TOTAL : 0.038358 sec INFO: No Floating Point Exceptions have been reported - 109,022,775 cycles # 2.645 GHz - 240,308,972 instructions # 2.20 insn per cycle - 0.041904895 seconds time elapsed + 108,920,181 cycles # 2.610 GHz + 240,176,540 instructions # 2.21 insn per cycle + 0.042378717 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7348) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.170349e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175260e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.175260e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.175182e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180276e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180276e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.050252 sec +TOTAL : 0.050174 sec INFO: No Floating Point Exceptions have been reported - 96,595,554 cycles # 1.802 GHz - 138,452,128 instructions # 1.43 insn per cycle - 0.054148545 seconds time elapsed + 97,067,712 cycles # 1.812 GHz + 138,415,288 instructions # 1.43 insn per cycle + 0.054229752 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 1692) (512y: 126) (512z: 6592) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index ca894b0a6d..65eb2e6009 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-16_15:19:42 +DATE: 2024-06-02_22:04:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.237277e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263102e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.267367e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239779e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.264296e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.268375e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.467317 sec +TOTAL : 0.465734 sec INFO: No Floating Point Exceptions have been reported - 1,933,877,717 cycles # 2.813 GHz - 2,829,779,417 instructions # 
1.46 insn per cycle - 0.746133258 seconds time elapsed + 1,944,767,230 cycles # 2.813 GHz + 2,830,393,799 instructions # 1.46 insn per cycle + 0.748614808 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.945887e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.087010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.096853e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.968348e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.116174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.125970e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483889 sec +TOTAL : 0.483276 sec INFO: No Floating Point Exceptions have been reported - 2,005,783,112 cycles # 2.816 GHz - 2,927,359,248 instructions # 1.46 insn per cycle - 0.768925329 seconds time elapsed + 2,014,046,726 cycles # 2.816 GHz + 2,960,547,521 instructions # 1.47 insn per cycle + 0.771364825 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.344408e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347652e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.347652e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.352099e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.355611e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.355611e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.162339 sec +TOTAL : 0.161935 sec INFO: No Floating Point Exceptions have been reported - 471,806,818 cycles # 2.848 GHz - 1,391,948,601 instructions # 2.95 insn per cycle - 0.166295977 seconds time elapsed + 471,513,746 cycles # 2.852 GHz + 1,391,998,687 instructions # 2.95 insn per cycle + 0.165915651 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3869) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.367799e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.379601e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.379601e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.340411e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.352264e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.352264e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.087176 sec +TOTAL : 0.087524 sec INFO: No Floating Point Exceptions have been reported - 243,999,829 cycles # 2.694 
GHz - 695,186,413 instructions # 2.85 insn per cycle - 0.091139423 seconds time elapsed + 244,692,087 cycles # 2.689 GHz + 695,265,791 instructions # 2.84 insn per cycle + 0.091623255 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.395387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.400899e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.400899e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.384610e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.390602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.390602e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042187 sec +TOTAL : 0.042428 sec INFO: No Floating Point Exceptions have been reported - 119,801,052 cycles # 2.624 GHz - 255,741,591 instructions # 2.13 insn per cycle - 0.046174431 seconds time elapsed + 120,402,247 cycles # 2.621 GHz + 255,771,436 instructions # 2.12 insn per cycle + 0.046482789 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8181) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.613988e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.621406e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.621406e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.577119e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.592040e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.592040e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037041 sec +TOTAL : 0.037809 sec INFO: No Floating Point Exceptions have been reported - 106,534,081 cycles # 2.639 GHz - 235,917,118 instructions # 2.21 insn per cycle - 0.041041363 seconds time elapsed + 106,943,960 cycles # 2.590 GHz + 235,812,455 instructions # 2.21 insn per cycle + 0.041801607 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7301) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.167962e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.172897e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.172897e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.052569e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.057306e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057306e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.049548 sec +TOTAL : 0.054678 sec INFO: No Floating Point Exceptions have been reported - 94,554,513 cycles # 1.786 GHz - 133,899,064 instructions # 1.42 insn per cycle - 0.053428613 seconds time elapsed + 95,978,810 cycles # 1.760 GHz + 134,249,554 instructions # 1.40 insn per cycle + 0.058819108 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 1641) (512y: 126) (512z: 6597) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index f86e27869e..a147c96b16 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-16_15:19:53 +DATE: 2024-06-02_22:05:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.541598e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.553658e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556693e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.545160e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.557368e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.560350e+04 ) sec^-1 MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 -TOTAL : 0.467629 sec +TOTAL : 0.469580 sec INFO: No Floating Point Exceptions have been reported - 1,964,166,954 cycles # 2.815 GHz - 2,823,406,286 instructions # 
1.44 insn per cycle - 0.754117473 seconds time elapsed + 1,952,036,300 cycles # 2.812 GHz + 2,839,447,182 instructions # 1.45 insn per cycle + 0.751257373 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.614317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.731134e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.742615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.618765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.738189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.751774e+05 ) sec^-1 MeanMatrixElemValue = ( 8.020493e-03 +- 4.025604e-03 ) GeV^-4 -TOTAL : 0.468434 sec +TOTAL : 0.470396 sec INFO: No Floating Point Exceptions have been reported - 1,946,164,211 cycles # 2.817 GHz - 2,847,399,547 instructions # 1.46 insn per cycle - 0.748191861 seconds time elapsed + 1,933,532,708 cycles # 2.817 GHz + 2,825,179,750 instructions # 1.46 insn per cycle + 0.744816373 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.448019e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.451516e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.451516e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.450156e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.453792e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.453792e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.158474 sec +TOTAL : 0.158288 sec INFO: No Floating Point Exceptions have been reported - 461,638,972 cycles # 2.852 GHz - 1,393,493,000 instructions # 3.02 insn per cycle - 0.162490485 seconds time elapsed + 461,148,667 cycles # 2.852 GHz + 1,393,475,309 instructions # 3.02 insn per cycle + 0.162260134 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3070) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.201120e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.205395e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205395e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199587e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.203982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203982e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.048902 sec +TOTAL : 0.048879 sec INFO: No Floating Point Exceptions have been reported - 138,099,810 cycles # 2.644 
GHz - 375,723,801 instructions # 2.72 insn per cycle - 0.052805368 seconds time elapsed + 138,617,500 cycles # 2.649 GHz + 375,838,324 instructions # 2.71 insn per cycle + 0.052819431 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:10134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.699468e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.721720e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.721720e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.585276e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.607346e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.607346e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.024500 sec +TOTAL : 0.025535 sec INFO: No Floating Point Exceptions have been reported - 72,431,086 cycles # 2.595 GHz - 146,734,646 instructions # 2.03 insn per cycle - 0.028413255 seconds time elapsed + 73,091,500 cycles # 2.523 GHz + 146,753,019 instructions # 2.01 insn per cycle + 0.029504478 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8933) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 2.950281e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.979563e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.979563e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061402e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.092590e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.092590e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.023199 sec +TOTAL : 0.022191 sec INFO: No Floating Point Exceptions have been reported - 67,511,576 cycles # 2.517 GHz - 136,466,222 instructions # 2.02 insn per cycle - 0.027372188 seconds time elapsed + 67,057,729 cycles # 2.606 GHz + 136,530,201 instructions # 2.04 insn per cycle + 0.026240821 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8164) (512y: 28) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.260359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.280493e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.280493e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.320153e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.340741e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340741e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.028479 sec +TOTAL : 0.027976 sec INFO: No Floating Point Exceptions have been reported - 59,124,236 cycles # 1.860 GHz - 85,286,285 instructions # 1.44 insn per cycle - 0.032355670 seconds time elapsed + 59,523,378 cycles # 1.900 GHz + 85,246,359 instructions # 1.43 insn per cycle + 0.031991723 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 2572) (512y: 32) (512z: 6935) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 2af7dd76f9..6d3597262c 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-16_15:20:03 +DATE: 2024-06-02_22:05:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.561126e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.572400e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.575387e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.556636e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.568185e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.572444e+04 ) sec^-1 MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 -TOTAL : 0.469592 sec +TOTAL : 0.472720 sec INFO: No Floating Point Exceptions have been reported - 1,933,901,131 cycles # 2.816 GHz - 2,803,636,036 instructions # 
1.45 insn per cycle - 0.744726293 seconds time elapsed + 1,931,223,021 cycles # 2.806 GHz + 2,809,227,604 instructions # 1.45 insn per cycle + 0.747270154 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.901730e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.003706e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005157e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.951614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.008354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.009763e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020495e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.471774 sec +TOTAL : 0.469854 sec INFO: No Floating Point Exceptions have been reported - 1,934,886,385 cycles # 2.815 GHz - 2,830,776,229 instructions # 1.46 insn per cycle - 0.746474254 seconds time elapsed + 1,934,807,792 cycles # 2.816 GHz + 2,823,952,331 instructions # 1.46 insn per cycle + 0.744383978 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.452227e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.455705e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.455705e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.440467e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.444153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.444153e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.157329 sec +TOTAL : 0.157850 sec INFO: No Floating Point Exceptions have been reported - 458,573,657 cycles # 2.854 GHz - 1,388,574,447 instructions # 3.03 insn per cycle - 0.161242660 seconds time elapsed + 459,198,020 cycles # 2.845 GHz + 1,388,550,014 instructions # 3.02 insn per cycle + 0.161959627 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.204538e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208976e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208976e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.193461e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.198367e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.198367e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.047932 sec +TOTAL : 0.048387 sec INFO: No Floating Point Exceptions have been reported - 136,097,535 cycles # 2.652 
GHz - 371,027,952 instructions # 2.73 insn per cycle - 0.051946079 seconds time elapsed + 136,709,201 cycles # 2.638 GHz + 370,998,148 instructions # 2.71 insn per cycle + 0.052404685 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:10117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559391e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.580217e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.580217e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.706186e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.728329e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.728329e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.024960 sec +TOTAL : 0.023794 sec INFO: No Floating Point Exceptions have been reported - 71,167,021 cycles # 2.517 GHz - 142,031,155 instructions # 2.00 insn per cycle - 0.028974311 seconds time elapsed + 70,529,419 cycles # 2.593 GHz + 141,874,277 instructions # 2.01 insn per cycle + 0.027836206 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8887) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 3.102195e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.131341e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.131341e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.065896e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.094176e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094176e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021142 sec +TOTAL : 0.022167 sec INFO: No Floating Point Exceptions have been reported - 63,906,261 cycles # 2.611 GHz - 131,729,034 instructions # 2.06 insn per cycle - 0.025029577 seconds time elapsed + 65,090,187 cycles # 2.575 GHz + 131,753,137 instructions # 2.02 insn per cycle + 0.026221124 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8117) (512y: 28) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.321655e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.342179e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.342179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345484e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345484e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.027241 sec +TOTAL : 0.027163 sec INFO: No Floating Point Exceptions have been reported - 57,621,926 cycles # 1.879 GHz - 80,488,160 instructions # 1.40 insn per cycle - 0.031258526 seconds time elapsed + 57,486,714 cycles # 1.879 GHz + 80,476,258 instructions # 1.40 insn per cycle + 0.031188983 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 2521) (512y: 32) (512z: 6939) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 16ac12981a..f1bf8ae1ae 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-16_15:20:13 +DATE: 2024-06-02_22:05:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.172533e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.195464e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.199217e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.186832e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.210139e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.214000e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.467059 sec +TOTAL : 0.466969 sec INFO: No Floating Point Exceptions have been reported - 1,929,783,722 cycles # 2.812 GHz - 2,830,067,082 instructions # 
1.47 insn per cycle - 0.744348567 seconds time elapsed + 1,929,963,905 cycles # 2.810 GHz + 2,820,321,257 instructions # 1.46 insn per cycle + 0.744660265 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.817494e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.954472e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.963776e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.774215e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915971e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.925293e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485210 sec +TOTAL : 0.481813 sec INFO: No Floating Point Exceptions have been reported - 1,989,265,248 cycles # 2.816 GHz - 2,972,405,087 instructions # 1.49 insn per cycle - 0.764721680 seconds time elapsed + 2,018,144,507 cycles # 2.824 GHz + 2,988,593,572 instructions # 1.48 insn per cycle + 0.771128476 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.312127e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.315249e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.315249e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.318424e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321576e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321576e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.164695 sec +TOTAL : 0.164181 sec INFO: No Floating Point Exceptions have been reported - 479,517,658 cycles # 2.854 GHz - 1,405,303,424 instructions # 2.93 insn per cycle - 0.168655160 seconds time elapsed + 478,545,974 cycles # 2.854 GHz + 1,405,298,148 instructions # 2.94 insn per cycle + 0.168196808 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3977) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.589174e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.601629e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.601629e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.576930e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.588828e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.588828e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.085009 sec +TOTAL : 0.085353 sec INFO: No Floating Point Exceptions have been reported - 242,672,694 cycles # 2.748 
GHz - 691,102,866 instructions # 2.85 insn per cycle - 0.088915527 seconds time elapsed + 242,856,261 cycles # 2.738 GHz + 691,007,271 instructions # 2.85 insn per cycle + 0.089392731 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9324) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.402863e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.409241e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.409241e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.350712e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.355921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.355921e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042757 sec +TOTAL : 0.044021 sec INFO: No Floating Point Exceptions have been reported - 119,836,607 cycles # 2.596 GHz - 257,882,084 instructions # 2.15 insn per cycle - 0.046733316 seconds time elapsed + 120,750,481 cycles # 2.554 GHz + 257,896,528 instructions # 2.14 insn per cycle + 0.048182370 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8244) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.611690e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.620124e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.620124e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596995e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604840e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604840e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037856 sec +TOTAL : 0.038244 sec INFO: No Floating Point Exceptions have been reported - 108,462,768 cycles # 2.631 GHz - 238,127,423 instructions # 2.20 insn per cycle - 0.041890123 seconds time elapsed + 108,668,422 cycles # 2.610 GHz + 238,349,934 instructions # 2.19 insn per cycle + 0.042362101 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7342) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.150674e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155466e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155466e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.152016e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156769e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156769e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.051477 sec +TOTAL : 0.051128 sec INFO: No Floating Point Exceptions have been reported - 99,538,839 cycles # 1.810 GHz - 139,339,349 instructions # 1.40 insn per cycle - 0.055665824 seconds time elapsed + 98,593,656 cycles # 1.806 GHz + 139,368,043 instructions # 1.41 insn per cycle + 0.055141464 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 1953) (512y: 122) (512z: 6323) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index 96180e8a09..1674ae1a31 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-16_15:20:23 +DATE: 2024-06-02_22:05:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.207087e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.230616e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.234507e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.211002e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.235184e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.239290e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.468179 sec +TOTAL : 0.468026 sec INFO: No Floating Point Exceptions have been reported - 1,938,727,271 cycles # 2.813 GHz - 2,835,562,501 instructions # 
1.46 insn per cycle - 0.747262841 seconds time elapsed + 1,934,707,675 cycles # 2.807 GHz + 2,829,874,008 instructions # 1.46 insn per cycle + 0.746355889 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.924846e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.065621e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.075056e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.925457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.068678e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078821e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.482793 sec +TOTAL : 0.484376 sec INFO: No Floating Point Exceptions have been reported - 2,011,507,022 cycles # 2.818 GHz - 2,962,288,052 instructions # 1.47 insn per cycle - 0.770325801 seconds time elapsed + 1,990,471,362 cycles # 2.818 GHz + 2,966,958,009 instructions # 1.49 insn per cycle + 0.763465136 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,15 +96,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.325014e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328184e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328184e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.320537e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.323936e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.323936e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.163699 sec +TOTAL : 0.163600 sec INFO: No Floating Point Exceptions have been reported - 475,740,171 cycles # 2.851 GHz - 1,400,755,519 instructions # 2.94 insn per cycle - 0.167716370 seconds time elapsed + 475,927,096 cycles # 2.852 GHz + 1,400,684,973 instructions # 2.94 insn per cycle + 0.167595349 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3871) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -124,15 +124,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.586616e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.599028e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.599028e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.590632e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.602811e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.602811e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.084604 sec +TOTAL : 0.084445 sec INFO: No Floating Point Exceptions have been reported - 242,310,895 cycles # 2.753 
GHz - 687,440,781 instructions # 2.84 insn per cycle - 0.088664129 seconds time elapsed + 241,578,930 cycles # 2.751 GHz + 687,384,148 instructions # 2.85 insn per cycle + 0.088448470 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -152,15 +152,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.421509e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.427219e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.427219e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419553e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425604e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425604e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.041396 sec +TOTAL : 0.041668 sec INFO: No Floating Point Exceptions have been reported - 117,633,598 cycles # 2.630 GHz - 253,582,281 instructions # 2.16 insn per cycle - 0.045344869 seconds time elapsed + 118,041,093 cycles # 2.620 GHz + 253,446,942 instructions # 2.15 insn per cycle + 0.045688456 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8196) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -180,15 +180,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.533249e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.540083e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.540083e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.610064e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.617322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.617322e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038782 sec +TOTAL : 0.037145 sec INFO: No Floating Point Exceptions have been reported - 106,121,372 cycles # 2.518 GHz - 233,883,831 instructions # 2.20 insn per cycle - 0.042791740 seconds time elapsed + 106,387,352 cycles # 2.623 GHz + 233,796,871 instructions # 2.20 insn per cycle + 0.041248038 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7292) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -208,15 +208,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.148151e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.152898e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.152898e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.153901e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158908e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.158908e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.050273 sec +TOTAL : 0.050243 sec INFO: No Floating Point Exceptions have been reported - 95,562,086 cycles # 1.781 GHz - 134,760,547 instructions # 1.41 insn per cycle - 0.054201969 seconds time elapsed + 96,297,242 cycles # 1.793 GHz + 134,709,358 instructions # 1.40 insn per cycle + 0.054323733 seconds time elapsed =Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2: 1904) (512y: 122) (512z: 6323) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 15f8e8659d..c46a8918fe 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-16_15:18:21 +DATE: 2024-06-02_22:03:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.830621e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.798641e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.407520e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623490e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.780511e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.419250e+08 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.518896 sec +TOTAL : 0.527999 sec INFO: No Floating Point Exceptions have been reported - 2,130,015,467 cycles # 2.824 GHz - 3,049,782,764 instructions # 1.43 insn per cycle - 
0.811167083 seconds time elapsed + 2,127,703,778 cycles # 2.804 GHz + 3,037,503,741 instructions # 1.43 insn per cycle + 0.819938887 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.652167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.115593e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.115593e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.629121e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112830e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.200987 sec +TOTAL : 1.204706 sec INFO: No Floating Point Exceptions have been reported - 3,451,141,340 cycles # 2.863 GHz - 8,714,346,508 instructions # 2.53 insn per cycle - 1.206502072 seconds time elapsed + 3,459,527,100 cycles # 2.861 GHz + 8,713,936,767 instructions # 2.52 insn per cycle + 1.210090686 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.615216e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 2.136998e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.136998e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.614799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.138803e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.138803e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.764589 sec +TOTAL : 0.766288 sec INFO: No Floating Point Exceptions have been reported - 2,197,801,743 cycles # 2.856 GHz - 5,465,338,789 instructions # 2.49 insn per cycle - 0.770190206 seconds time elapsed + 2,201,462,395 cycles # 2.855 GHz + 5,464,414,082 instructions # 2.48 insn per cycle + 0.771678284 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1298) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.276018e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.408168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.408168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.243332e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.350818e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.350818e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.576218 sec +TOTAL : 0.584870 sec INFO: No Floating Point Exceptions have been reported - 1,593,709,911 cycles # 2.743 GHz - 3,182,241,147 instructions # 2.00 insn per cycle - 0.581747530 seconds time elapsed + 1,605,862,312 cycles # 2.723 GHz + 3,180,962,176 instructions # 1.98 insn per cycle + 0.590347744 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 0) (512z: 
0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.349428e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.560869e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.560869e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328694e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520640e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520640e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.561533 sec +TOTAL : 0.567215 sec INFO: No Floating Point Exceptions have been reported - 1,552,006,209 cycles # 2.741 GHz - 3,083,871,547 instructions # 1.99 insn per cycle - 0.567100846 seconds time elapsed + 1,559,045,940 cycles # 2.726 GHz + 3,082,232,081 instructions # 1.98 insn per cycle + 0.572568359 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1274) (512y: 95) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.103380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.012957e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.012957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.106594e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.015389e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.015389e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.614313 sec +TOTAL : 0.614754 sec INFO: No Floating Point Exceptions have been reported - 1,344,567,311 cycles # 2.171 GHz - 2,376,857,450 instructions # 1.77 insn per cycle - 0.619905839 seconds time elapsed + 1,347,809,988 cycles # 2.176 GHz + 2,375,607,493 instructions # 1.76 insn per cycle + 0.620258555 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 584) (512y: 62) (512z: 953) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 6add239f16..e5d6236670 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-16_15:18:33 +DATE: 2024-06-02_22:03:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.948407e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.328423e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.761410e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.746443e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.322888e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.759689e+08 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.519601 sec +TOTAL : 0.521311 sec INFO: No Floating Point Exceptions have been reported - 2,123,926,879 cycles # 2.815 GHz - 2,991,717,095 instructions # 1.41 insn per cycle - 0.811782941 seconds time elapsed + 2,128,407,529 cycles # 2.814 GHz + 3,052,405,520 instructions # 1.43 insn per cycle + 0.813389325 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.686449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.122021e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.122021e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.688675e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.121548e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.121548e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.196252 sec +TOTAL : 1.196984 sec INFO: No Floating Point Exceptions have been reported - 3,435,810,217 cycles # 2.862 GHz - 8,629,255,980 instructions # 2.51 insn per cycle - 1.201785163 seconds time elapsed + 3,435,642,752 cycles # 2.859 GHz + 8,628,896,472 instructions # 2.51 insn per cycle + 1.202405155 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 403) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.590372e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.090308e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.090308e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.637775e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.172160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172160e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.773787 sec +TOTAL : 0.755876 sec INFO: No Floating Point Exceptions have been reported - 2,172,281,754 cycles # 2.790 GHz - 5,399,686,889 instructions # 2.49 insn per cycle - 0.779398624 seconds time elapsed + 2,176,531,869 cycles # 2.862 GHz + 5,398,906,105 instructions # 2.48 insn per cycle + 0.761260827 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1258) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.283822e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.420214e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.420214e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.236743e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.324010e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.324010e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.573489 sec +TOTAL : 0.584681 sec INFO: No Floating Point Exceptions have been reported - 1,585,769,603 cycles # 2.741 GHz - 3,149,146,191 instructions # 1.99 insn per cycle - 0.579182812 seconds time elapsed + 1,593,673,714 cycles # 2.704 GHz + 3,147,296,381 instructions # 1.97 insn per cycle + 0.590069472 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1386) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.354137e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604902e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604902e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.274934e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.428395e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.428395e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.559958 sec +TOTAL : 0.577971 sec INFO: No Floating Point Exceptions have been reported - 1,547,131,577 cycles # 2.739 GHz - 3,062,437,995 instructions # 1.98 insn per cycle - 0.565482274 seconds time elapsed + 1,554,970,499 cycles # 2.667 GHz + 3,061,298,117 instructions # 1.97 insn per cycle + 0.583755416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1220) (512y: 95) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.108481e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.023241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.023241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.106748e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.027961e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027961e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.612188 sec +TOTAL : 0.613683 sec INFO: No Floating Point Exceptions have been reported - 1,354,565,413 cycles # 2.195 GHz - 2,362,076,089 instructions # 1.74 insn per cycle - 0.617754113 seconds time elapsed + 1,361,641,398 cycles # 2.201 GHz + 2,360,583,231 instructions # 1.73 insn per cycle + 0.619282503 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 557) (512y: 62) (512z: 944) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 35b822f8f6..e5e2512c5d 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-16_15:18:45 +DATE: 2024-06-02_22:03:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.370205e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202282e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.219119e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.267825e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155988e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.256672e+09 ) sec^-1 MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 -TOTAL : 0.481970 sec +TOTAL : 0.482081 sec INFO: No Floating Point Exceptions have been reported - 1,992,725,828 cycles # 2.818 GHz - 2,868,294,521 instructions # 1.44 insn per cycle - 0.764321619 seconds time elapsed + 2,007,174,940 cycles # 2.814 GHz + 2,862,530,986 instructions # 1.43 insn per cycle + 0.770265614 seconds 
time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 72 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.685625e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.126627e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.126627e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.657361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.123963e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.123963e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.173857 sec +TOTAL : 1.176818 sec INFO: No Floating Point Exceptions have been reported - 3,371,653,633 cycles # 2.862 GHz - 8,663,374,999 instructions # 2.57 insn per cycle - 1.179087797 seconds time elapsed + 3,379,168,439 cycles # 2.861 GHz + 8,663,925,346 instructions # 2.56 insn per cycle + 1.182131834 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 464) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.242831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.476100e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.476100e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
2.286459e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.570924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570924e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.559869 sec +TOTAL : 0.551800 sec INFO: No Floating Point Exceptions have been reported - 1,544,628,517 cycles # 2.742 GHz - 3,687,558,281 instructions # 2.39 insn per cycle - 0.565253973 seconds time elapsed + 1,548,820,259 cycles # 2.783 GHz + 3,686,997,614 instructions # 2.38 insn per cycle + 0.557181964 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.072720e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.536969e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.536969e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.038982e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.429005e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.429005e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.431765 sec +TOTAL : 0.435959 sec INFO: No Floating Point Exceptions have been reported - 1,203,780,059 cycles # 2.758 GHz - 2,425,738,448 instructions # 2.02 insn per cycle - 0.436956710 seconds time elapsed + 1,208,731,547 cycles # 2.744 GHz + 2,424,737,625 instructions # 2.01 insn per cycle + 0.441028354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1835) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.171115e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.846212e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.846212e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.104774e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.649197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.649197e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.420853 sec +TOTAL : 0.429216 sec INFO: No Floating Point Exceptions have been reported - 1,176,016,394 cycles # 2.764 GHz - 2,371,904,468 instructions # 2.02 insn per cycle - 0.426173333 seconds time elapsed + 1,186,032,707 cycles # 2.731 GHz + 2,375,887,228 instructions # 2.00 insn per cycle + 0.434797682 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.877260e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.908000e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.908000e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.873089e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.905822e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.905822e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 
+- 1.231178e-04 ) GeV^0 -TOTAL : 0.456855 sec +TOTAL : 0.457639 sec INFO: No Floating Point Exceptions have been reported - 1,057,659,631 cycles # 2.291 GHz - 2,045,594,279 instructions # 1.93 insn per cycle - 0.462305299 seconds time elapsed + 1,058,281,507 cycles # 2.289 GHz + 2,045,070,071 instructions # 1.93 insn per cycle + 0.462906421 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1125) (512y: 5) (512z: 1216) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index 7aff49b16c..ac0cd4f08e 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-16_15:18:56 +DATE: 2024-06-02_22:04:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.371360e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210950e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.256375e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.274861e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.207280e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.264572e+09 ) sec^-1 MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 -TOTAL : 0.480672 sec +TOTAL : 0.483176 sec INFO: No Floating Point Exceptions have been reported - 1,992,055,315 cycles # 2.814 GHz - 2,833,598,547 instructions # 1.42 insn per cycle - 0.764848194 seconds time elapsed + 1,992,294,984 cycles # 2.814 GHz + 2,821,532,343 instructions # 1.42 insn per cycle + 0.766539932 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 71 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.763702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.137508e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.137508e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.761895e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.138382e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138382e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.163446 sec +TOTAL : 1.164683 sec INFO: No Floating Point Exceptions have been reported - 3,338,476,373 cycles # 2.858 GHz - 8,537,550,948 instructions # 2.56 insn per cycle - 1.168736395 seconds time elapsed + 3,347,740,263 cycles # 2.864 GHz + 8,536,643,122 instructions # 2.55 insn per cycle + 1.169926925 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 372) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.260122e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.497908e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497908e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.377967e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.780801e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.780801e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.555036 sec +TOTAL : 0.533084 sec INFO: No Floating Point Exceptions have been reported - 1,536,047,057 cycles # 2.745 GHz - 3,655,155,421 instructions # 2.38 insn per cycle - 0.560267212 seconds time elapsed + 1,538,945,489 cycles # 2.861 GHz + 3,654,064,050 instructions # 2.37 insn per cycle + 0.538506706 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1417) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.063874e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.501699e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.501699e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061693e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.506473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.506473e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.432903 sec +TOTAL : 0.433720 sec INFO: No Floating Point Exceptions have been reported - 1,210,141,290 cycles # 2.765 GHz - 2,409,755,736 instructions # 1.99 insn per cycle - 0.438252635 seconds time elapsed + 1,212,102,221 cycles # 2.764 GHz + 2,408,536,618 instructions # 1.99 insn per cycle + 0.439199966 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1739) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.166764e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.861571e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.861571e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.151109e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.798793e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.798793e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.420903 sec +TOTAL : 0.423301 sec INFO: No Floating Point Exceptions have been reported - 1,178,969,939 cycles # 2.770 GHz - 2,360,225,770 instructions # 2.00 insn per cycle - 0.426183474 seconds time elapsed + 1,183,993,010 cycles # 2.767 GHz + 2,358,621,607 instructions # 1.99 insn per cycle + 0.428753176 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1639) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.911284e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.009343e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.009343e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.903396e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.993626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.993626e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.450059 sec +TOTAL : 0.452229 sec INFO: No Floating Point Exceptions have been reported - 1,050,992,336 cycles # 2.312 GHz - 2,030,439,704 instructions # 1.93 insn per cycle - 0.455402836 seconds time elapsed + 1,057,097,121 cycles # 2.313 GHz + 2,029,647,637 instructions # 1.92 insn per cycle + 0.457646372 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1038) (512y: 5) (512z: 1206) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index abe970d6c3..175afd95a7 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-16_15:19:08 +DATE: 2024-06-02_22:04:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.820532e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.774843e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.362520e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.031843e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.757380e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.369019e+08 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.522135 sec +TOTAL : 0.522857 sec INFO: No Floating Point Exceptions have been reported - 2,125,526,304 cycles # 2.816 GHz - 3,031,609,259 instructions # 1.43 insn per cycle - 0.813775431 seconds time elapsed + 2,138,931,604 cycles # 2.823 GHz + 3,065,086,151 instructions # 1.43 insn per cycle + 0.815055642 seconds 
time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.477506e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.465539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090638e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.221347 sec +TOTAL : 1.223589 sec INFO: No Floating Point Exceptions have been reported - 3,505,104,547 cycles # 2.859 GHz - 8,781,502,817 instructions # 2.51 insn per cycle - 1.226777715 seconds time elapsed + 3,510,333,340 cycles # 2.858 GHz + 8,780,325,111 instructions # 2.50 insn per cycle + 1.229177242 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.650256e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201424e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201424e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.643831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.197785e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197785e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.750812 sec +TOTAL : 0.754390 sec INFO: No Floating Point Exceptions have been reported - 2,158,593,065 cycles # 2.858 GHz - 5,461,970,761 instructions # 2.53 insn per cycle - 0.756427517 seconds time elapsed + 2,172,612,390 cycles # 2.861 GHz + 5,461,989,505 instructions # 2.51 insn per cycle + 0.760002313 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1315) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.173052e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.222124e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.222124e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.213379e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.275644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.275644e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.600946 sec +TOTAL : 0.591323 sec INFO: No Floating Point Exceptions have been reported - 1,584,857,703 cycles # 2.630 GHz - 3,130,453,718 instructions # 1.98 insn per cycle - 0.606559761 seconds time elapsed + 1,584,687,799 cycles # 2.657 GHz + 3,128,023,138 instructions # 1.97 insn per cycle + 0.597064372 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1508) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.444228e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.788523e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.788523e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.378838e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.665981e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.665981e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.544040 sec +TOTAL : 0.557636 sec INFO: No Floating Point Exceptions have been reported - 1,507,653,377 cycles # 2.746 GHz - 2,979,978,086 instructions # 1.98 insn per cycle - 0.549733637 seconds time elapsed + 1,515,882,226 cycles # 2.694 GHz + 2,979,109,420 instructions # 1.97 insn per cycle + 0.563386258 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1266) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.159766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.131056e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.131056e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.142290e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.104661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.104661e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 
+- 1.231178e-04 ) GeV^0 -TOTAL : 0.601738 sec +TOTAL : 0.607146 sec INFO: No Floating Point Exceptions have been reported - 1,324,343,740 cycles # 2.183 GHz - 2,317,585,809 instructions # 1.75 insn per cycle - 0.607328338 seconds time elapsed + 1,330,909,045 cycles # 2.175 GHz + 2,316,396,076 instructions # 1.74 insn per cycle + 0.612769323 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 708) (512y: 64) (512z: 1000) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 91c7a883f0..c48f15473d 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-16_15:19:20 +DATE: 2024-06-02_22:04:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.922874e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.310136e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.745093e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.249284e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.245731e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.567694e+08 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.518290 sec +TOTAL : 0.522953 sec INFO: No Floating Point Exceptions have been reported - 2,124,893,311 cycles # 2.820 GHz - 3,045,592,907 instructions # 1.43 insn per cycle - 0.810370808 seconds time elapsed + 2,139,960,069 cycles # 2.824 GHz + 3,035,488,341 instructions # 1.42 insn per cycle + 0.816127779 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.542081e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.100861e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.100861e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.565233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.104572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104572e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.212162 sec +TOTAL : 1.211740 sec INFO: No Floating Point Exceptions have been reported - 3,479,876,909 cycles # 2.860 GHz - 8,693,142,752 instructions # 2.50 insn per cycle - 1.217788949 seconds time elapsed + 3,489,404,912 cycles # 2.868 GHz + 8,691,090,951 instructions # 2.49 insn per cycle + 1.217329641 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.583309e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.076893e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.076893e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.593140e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.091329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.091329e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.776846 sec +TOTAL : 0.773508 sec INFO: No Floating Point Exceptions have been reported - 2,167,338,088 cycles # 2.773 GHz - 5,396,551,029 instructions # 2.49 insn per cycle - 0.782321373 seconds time elapsed + 2,171,763,818 cycles # 2.790 GHz + 5,395,529,961 instructions # 2.48 insn per cycle + 0.779263038 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1286) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.326845e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.550286e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.550286e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.359046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.585537e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.585537e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.565802 sec +TOTAL : 0.560683 sec INFO: No Floating Point Exceptions have been reported - 1,565,712,129 cycles # 2.743 GHz - 3,096,211,416 instructions # 1.98 insn per cycle - 0.571442008 seconds time elapsed + 1,579,967,618 cycles # 2.793 GHz + 3,095,230,267 instructions # 1.96 insn per cycle + 0.566362230 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1403) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.453432e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.812851e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.812851e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.453478e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.809547e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.809547e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.541762 sec +TOTAL : 0.542291 sec INFO: No Floating Point Exceptions have been reported - 1,501,240,710 cycles # 2.746 GHz - 2,962,583,104 instructions # 1.97 insn per cycle - 0.547343450 seconds time elapsed + 1,503,606,383 cycles # 2.747 GHz + 2,961,368,670 instructions # 1.97 insn per cycle + 0.547945591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1207) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.179755e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.168512e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.168512e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.154104e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.130242e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.130242e+06 ) sec^-1 MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.595795 sec +TOTAL : 0.603301 sec INFO: No Floating Point Exceptions have been reported - 1,328,066,698 cycles # 2.210 GHz - 2,301,968,914 instructions # 1.73 insn per cycle - 0.601517736 seconds time elapsed + 1,337,530,185 cycles # 2.200 GHz + 2,301,032,773 instructions # 1.72 insn per cycle + 0.608821276 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 669) (512y: 64) (512z: 987) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 685cbca5b9..279b0d02f4 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:16:05 +DATE: 2024-06-02_22:01:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.742150e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168430e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277843e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.298622e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163951e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277049e+08 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.532609 sec +TOTAL : 0.536129 sec INFO: No Floating Point Exceptions have been reported - 2,187,320,510 cycles # 2.847 GHz - 3,138,661,758 instructions # 1.43 insn per cycle - 0.825533767 seconds time elapsed + 2,173,999,771 cycles # 2.814 GHz + 3,130,541,130 instructions # 1.44 insn per cycle + 0.830459706 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.052254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.112326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.112326e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.020488e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079496e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079496e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.217611 sec +TOTAL : 5.301250 sec INFO: No Floating Point Exceptions have been reported - 15,171,088,318 cycles # 2.905 GHz - 38,379,828,637 instructions # 2.53 insn per cycle - 5.223033411 seconds time elapsed + 15,199,764,023 cycles # 2.865 GHz + 38,382,132,016 instructions # 2.53 insn per cycle + 5.306881853 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 673) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.483453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675957e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675957e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.456929e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.646631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.646631e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.119586 sec +TOTAL : 3.144938 sec INFO: No Floating Point Exceptions have been reported - 9,050,575,942 cycles # 2.897 GHz - 24,585,418,505 instructions # 2.72 insn per cycle - 3.125051862 seconds time elapsed + 9,021,281,744 cycles # 2.864 GHz + 24,583,412,308 instructions # 2.73 insn per cycle + 3.150495651 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.531605e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.007383e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.007383e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.307946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.757750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.757750e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.004395 sec +TOTAL : 2.087521 sec INFO: No Floating Point Exceptions have been reported - 5,470,487,475 cycles # 2.723 GHz - 11,258,117,341 instructions # 2.06 insn per cycle - 2.009874159 seconds time elapsed + 5,484,294,083 cycles # 2.622 GHz + 11,256,076,031 instructions # 2.05 insn per cycle + 2.093164240 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2379) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.034312e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.611178e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.611178e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.046359e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.624997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.624997e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.846817 sec +TOTAL : 1.845597 sec INFO: No Floating Point Exceptions have been reported - 4,937,000,755 cycles # 2.666 GHz - 10,562,656,233 instructions # 2.14 insn per cycle - 1.852346867 seconds time elapsed + 4,960,542,470 cycles # 2.681 GHz + 10,562,896,493 instructions # 2.13 insn per cycle + 1.851112565 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.686069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.892849e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.892849e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.595965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.792720e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.792720e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 
3.227953e-03 ) GeV^0 -TOTAL : 2.955560 sec +TOTAL : 3.027661 sec INFO: No Floating Point Exceptions have been reported - 5,363,967,162 cycles # 1.812 GHz - 7,798,816,647 instructions # 1.45 insn per cycle - 2.961128813 seconds time elapsed + 5,393,320,109 cycles # 1.779 GHz + 7,799,680,893 instructions # 1.45 insn per cycle + 3.033220380 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1545) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index e33bd01ef0..c0d78783de 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:16:29 +DATE: 2024-06-02_22:01:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.734270e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167895e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277771e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.411711e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168792e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279006e+08 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531030 sec +TOTAL : 0.531618 sec INFO: No Floating Point Exceptions have been reported - 2,147,766,041 cycles # 2.808 GHz - 3,081,960,346 instructions # 1.43 insn per cycle - 0.823573588 seconds time elapsed + 2,183,799,831 cycles # 2.822 GHz + 3,094,825,077 instructions # 1.42 insn per cycle + 0.831170247 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.072347e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
2.133952e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.133952e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.045453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.106247e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.106247e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.167480 sec +TOTAL : 5.236847 sec INFO: No Floating Point Exceptions have been reported - 15,011,121,904 cycles # 2.902 GHz - 40,101,107,795 instructions # 2.67 insn per cycle - 5.172969591 seconds time elapsed + 15,020,999,910 cycles # 2.866 GHz + 40,099,937,559 instructions # 2.67 insn per cycle + 5.242391746 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.643871e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.853935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.853935e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.600352e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806473e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.986462 sec +TOTAL : 3.023117 sec INFO: No Floating Point Exceptions have been reported - 8,687,902,361 cycles # 2.905 GHz - 23,671,582,038 instructions # 2.72 insn per cycle - 2.991891761 seconds time elapsed + 8,678,762,741 cycles # 2.866 GHz + 23,668,927,694 instructions # 2.73 insn per cycle + 3.028764335 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.688647e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.031946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.031946e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.855816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.228551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.228551e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.347118 sec +TOTAL : 2.270383 sec INFO: No Floating Point Exceptions have been reported - 6,408,205,490 cycles # 2.726 GHz - 13,061,009,362 instructions # 2.04 insn per cycle - 2.352705794 seconds time elapsed + 6,094,093,478 cycles # 2.679 GHz + 13,059,046,457 instructions # 2.14 insn per cycle + 2.275971704 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2545) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.217515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.639971e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.639971e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.110330e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.520213e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.520213e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.116902 sec +TOTAL : 2.162713 sec INFO: No Floating Point Exceptions have been reported - 5,786,103,959 cycles # 2.728 GHz - 12,322,398,791 instructions # 2.13 insn per cycle - 2.122365893 seconds time elapsed + 5,811,211,556 cycles # 2.681 GHz + 12,318,701,301 instructions # 2.12 insn per cycle + 2.168344172 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2092) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.391355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.565589e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.565589e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.308089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.473010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.473010e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.201193 sec +TOTAL : 3.279972 sec INFO: No Floating Point Exceptions have been reported - 5,819,258,849 cycles # 1.816 GHz - 9,603,315,511 instructions # 1.65 insn per cycle - 3.206783116 seconds time elapsed + 5,822,767,374 cycles # 1.773 GHz + 9,603,130,120 instructions # 1.65 insn per cycle + 3.285517019 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1970) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index fa2404eda0..00b2a7887f 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:16:53 +DATE: 2024-06-02_22:02:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.806467e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679043e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.988694e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.781883e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.602042e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.975899e+08 ) sec^-1 MeanMatrixElemValue = ( 3.294909e+00 +- 3.228140e-03 ) GeV^0 -TOTAL : 0.484472 sec +TOTAL : 0.491268 sec INFO: No Floating Point Exceptions have been reported - 2,024,107,607 cycles # 2.847 GHz - 2,925,717,340 instructions # 1.45 insn per cycle - 0.767822860 seconds time elapsed + 2,009,507,618 cycles # 2.799 GHz + 2,900,729,651 instructions # 1.44 insn per cycle + 0.775023034 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.190102e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.263149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.263149e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.165672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.236261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.236261e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 -TOTAL : 4.875075 sec +TOTAL : 4.931237 sec INFO: No Floating Point Exceptions have been reported - 14,157,231,167 cycles # 2.902 GHz - 38,349,372,496 instructions # 2.71 insn per cycle - 4.880360280 seconds time elapsed + 14,146,731,336 cycles # 2.866 GHz + 38,345,680,249 instructions # 2.71 insn per cycle + 4.936728571 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.893708e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.295163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.295163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.834710e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.233739e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.233739e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 -TOTAL : 2.231375 sec +TOTAL : 2.259579 sec INFO: No Floating Point Exceptions have been reported - 6,474,839,888 cycles # 2.896 GHz - 15,821,273,128 instructions # 2.44 insn per cycle - 2.236825857 seconds time elapsed + 6,488,445,171 cycles # 2.865 GHz + 15,819,901,990 instructions # 2.44 insn per cycle + 2.265166416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.952001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.027533e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.027533e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.775248e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.005815e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.005815e+06 ) sec^-1 MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.258720 sec +TOTAL : 1.284026 sec INFO: No Floating Point Exceptions have been reported - 3,454,982,692 cycles # 2.735 GHz - 7,599,041,128 instructions # 2.20 insn per cycle - 1.263980564 seconds time elapsed + 3,459,365,538 cycles # 2.685 GHz + 7,598,231,538 instructions # 2.20 insn per cycle + 1.289366574 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3054) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.592851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112843e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112843e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.437805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092647e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092647e+06 ) sec^-1 MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.180051 sec +TOTAL : 1.200609 sec INFO: No Floating Point Exceptions have been reported - 3,244,154,820 cycles # 2.739 GHz - 7,208,080,032 instructions # 2.22 insn per cycle - 1.185371954 seconds time elapsed + 3,247,417,265 cycles # 2.696 GHz + 7,207,177,396 instructions # 2.22 insn per cycle + 1.205866400 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.861599e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.601056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.601056e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.751112e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.477606e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.477606e+05 ) sec^-1 MeanMatrixElemValue = ( 3.295004e+00 +- 
3.229072e-03 ) GeV^0 -TOTAL : 1.616099 sec +TOTAL : 1.642510 sec INFO: No Floating Point Exceptions have been reported - 3,061,871,050 cycles # 1.890 GHz - 5,840,738,200 instructions # 1.91 insn per cycle - 1.621459577 seconds time elapsed + 3,066,183,622 cycles # 1.862 GHz + 5,839,500,735 instructions # 1.90 insn per cycle + 1.647870341 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2375) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 17580b0829..2e0a99a1cf 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:17:12 +DATE: 2024-06-02_22:02:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.907160e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.728602e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.048441e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.478321e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.704119e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.049758e+08 ) sec^-1 MeanMatrixElemValue = ( 3.294909e+00 +- 3.228140e-03 ) GeV^0 -TOTAL : 0.485743 sec +TOTAL : 0.485405 sec INFO: No Floating Point Exceptions have been reported - 2,023,423,533 cycles # 2.849 GHz - 2,905,255,031 instructions # 1.44 insn per cycle - 0.768600730 seconds time elapsed + 2,012,796,173 cycles # 2.826 GHz + 2,903,006,317 instructions # 1.44 insn per cycle + 0.768724930 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.168782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
2.238544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.238544e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.137053e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.205479e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.205479e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 -TOTAL : 4.921731 sec +TOTAL : 4.995257 sec INFO: No Floating Point Exceptions have been reported - 14,314,886,956 cycles # 2.906 GHz - 39,834,092,366 instructions # 2.78 insn per cycle - 4.927032591 seconds time elapsed + 14,321,719,125 cycles # 2.865 GHz + 39,835,690,494 instructions # 2.78 insn per cycle + 5.000617134 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.713515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.269520e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.269520e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.647900e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.198922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.198922e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 -TOTAL : 1.922771 sec +TOTAL : 1.945695 sec INFO: No Floating Point Exceptions have been reported - 5,581,497,918 cycles # 2.896 GHz - 15,286,085,618 instructions # 2.74 insn per cycle - 1.928038449 seconds time elapsed + 5,584,746,201 cycles # 2.864 GHz + 15,284,426,800 instructions # 2.74 insn per cycle + 1.951180487 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.348339e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.237642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.855975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855975e+05 ) sec^-1 MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.738529 sec +TOTAL : 1.769454 sec INFO: No Floating Point Exceptions have been reported - 4,748,584,350 cycles # 2.724 GHz - 9,734,762,909 instructions # 2.05 insn per cycle - 1.743720825 seconds time elapsed + 4,749,170,648 cycles # 2.677 GHz + 9,734,022,428 instructions # 2.05 insn per cycle + 1.774782238 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3707) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.524514e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.201131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.201131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.407642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.062522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 7.062522e+05 ) sec^-1 MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.693263 sec +TOTAL : 1.724353 sec INFO: No Floating Point Exceptions have been reported - 4,630,030,488 cycles # 2.727 GHz - 9,326,323,775 instructions # 2.01 insn per cycle - 1.698452247 seconds time elapsed + 4,623,692,148 cycles # 2.674 GHz + 9,324,388,930 instructions # 2.02 insn per cycle + 1.729730077 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3495) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.566237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.043529e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.043529e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.457416e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.921892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.921892e+05 ) sec^-1 MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.970968 sec +TOTAL : 2.009945 sec INFO: No Floating Point Exceptions have been reported - 3,659,262,236 cycles # 1.853 GHz - 7,035,706,161 instructions # 1.92 insn per cycle - 1.976219857 seconds time elapsed + 3,661,033,798 cycles # 1.818 GHz + 7,034,840,971 instructions # 1.92 insn per cycle + 2.015460084 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 12) (512z: 2220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git 
a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index b504154b8b..ea5a9dfe42 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:17:33 +DATE: 2024-06-02_22:02:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.734753e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.166290e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275672e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.411466e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167511e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.278097e+08 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.527580 sec +TOTAL : 0.531668 sec INFO: No Floating Point Exceptions have been reported - 2,184,025,819 cycles # 2.852 GHz - 3,120,664,968 instructions # 1.43 insn per cycle - 0.822365132 seconds time elapsed + 2,181,212,654 cycles # 2.815 GHz + 3,113,262,519 instructions # 1.43 insn per cycle + 0.831954647 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 
==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.032702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.091464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.091464e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.011160e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.069301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.069301e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.267767 sec +TOTAL : 5.324881 sec INFO: No Floating Point Exceptions have been reported - 15,275,610,730 cycles # 2.898 GHz - 38,585,204,587 instructions # 2.53 insn per cycle - 5.273127531 seconds time elapsed + 15,270,464,570 cycles # 2.866 GHz + 38,583,585,562 instructions # 2.53 insn per cycle + 5.330529959 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 677) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.478780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672331e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672331e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.489143e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.682455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.682455e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 
+- 3.227953e-03 ) GeV^0 -TOTAL : 3.124457 sec +TOTAL : 3.116860 sec INFO: No Floating Point Exceptions have been reported - 8,951,368,692 cycles # 2.862 GHz - 24,230,346,765 instructions # 2.71 insn per cycle - 3.129932357 seconds time elapsed + 8,946,095,501 cycles # 2.866 GHz + 24,230,074,231 instructions # 2.71 insn per cycle + 3.122402700 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.646169e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.144963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.144963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.511600e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.996636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.996636e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.966588 sec +TOTAL : 2.016820 sec INFO: No Floating Point Exceptions have been reported - 5,394,193,630 cycles # 2.737 GHz - 11,282,079,100 instructions # 2.09 insn per cycle - 1.972075346 seconds time elapsed + 5,398,022,181 cycles # 2.671 GHz + 11,281,135,154 instructions # 2.09 insn per cycle + 2.022588826 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2483) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP 
precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.312770e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.933844e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.933844e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.136939e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.737334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.737334e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.769300 sec +TOTAL : 1.820797 sec INFO: No Floating Point Exceptions have been reported - 4,855,634,573 cycles # 2.737 GHz - 10,529,908,188 instructions # 2.17 insn per cycle - 1.774939787 seconds time elapsed + 4,868,000,366 cycles # 2.667 GHz + 10,530,833,141 instructions # 2.16 insn per cycle + 1.826342910 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2170) (512y: 148) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779051e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.993953e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.993953e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.738166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.952895e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.952895e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.883837 sec +TOTAL : 2.917099 sec INFO: No Floating Point Exceptions have been reported - 5,232,692,174 cycles # 1.812 GHz - 7,609,089,901 instructions # 1.45 insn per cycle - 2.889504238 
seconds time elapsed + 5,206,834,374 cycles # 1.782 GHz + 7,607,869,413 instructions # 1.46 insn per cycle + 2.922673764 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1611) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 62b069d661..611ee95bf5 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-16_15:17:56 +DATE: 2024-06-02_22:03:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.743856e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168884e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279553e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.372214e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166063e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277281e+08 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531580 sec +TOTAL : 0.531410 sec INFO: 
No Floating Point Exceptions have been reported - 2,155,818,187 cycles # 2.818 GHz - 3,085,690,683 instructions # 1.43 insn per cycle - 0.823819066 seconds time elapsed + 2,165,210,077 cycles # 2.821 GHz + 3,108,583,253 instructions # 1.44 insn per cycle + 0.824896732 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.002464e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.060011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.060011e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.998905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.057195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.057195e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.344548 sec +TOTAL : 5.355715 sec INFO: No Floating Point Exceptions have been reported - 15,331,700,326 cycles # 2.866 GHz - 40,369,778,421 instructions # 2.63 insn per cycle - 5.350011304 seconds time elapsed + 15,350,187,703 cycles # 2.864 GHz + 40,368,332,178 instructions # 2.63 insn per cycle + 5.361266689 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -109,15 +109,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] 
('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.555017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.755921e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.755921e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.655753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869389e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.059082 sec +TOTAL : 2.979107 sec INFO: No Floating Point Exceptions have been reported - 8,522,277,742 cycles # 2.782 GHz - 23,253,428,254 instructions # 2.73 insn per cycle - 3.064709896 seconds time elapsed + 8,538,137,981 cycles # 2.862 GHz + 23,251,495,548 instructions # 2.72 insn per cycle + 2.984645572 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2090) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -137,15 +137,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.699594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.044812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.044812e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.687752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034396e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.340476 sec +TOTAL : 2.348055 sec INFO: No Floating Point Exceptions have been reported - 6,239,696,903 cycles # 2.661 GHz - 12,963,096,678 instructions # 2.08 insn per cycle - 2.346005075 seconds time elapsed + 6,251,107,015 cycles # 2.657 GHz + 12,960,902,963 instructions 
# 2.07 insn per cycle + 2.353740392 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2668) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.032659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.430530e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.430530e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.964987e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.353858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.353858e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.191599 sec +TOTAL : 2.222113 sec INFO: No Floating Point Exceptions have been reported - 5,901,015,524 cycles # 2.687 GHz - 12,238,387,260 instructions # 2.07 insn per cycle - 2.197121947 seconds time elapsed + 5,918,688,699 cycles # 2.658 GHz + 12,237,201,089 instructions # 2.07 insn per cycle + 2.227737714 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2208) (512y: 296) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.554826e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.745267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.745267e+05 ) 
sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.434413e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.612405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.612405e+05 ) sec^-1 MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.058656 sec +TOTAL : 3.164008 sec INFO: No Floating Point Exceptions have been reported - 5,596,491,041 cycles # 1.827 GHz - 8,743,545,379 instructions # 1.56 insn per cycle - 3.064278596 seconds time elapsed + 5,604,141,468 cycles # 1.769 GHz + 8,744,053,502 instructions # 1.56 insn per cycle + 3.169616891 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1908) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe From 346c9df7b195fa61ba5850533aef3eb350694358 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 08:26:44 +0200 Subject: [PATCH 18/33] [tmad] rerun again 30 tmad tests on itscrd90 - all as expected (failures in heft #833, susy #825 #826 and ggttgg #856 for iconfig 104) STARTED AT Sun Jun 2 10:08:08 PM CEST 2024 (SM tests) ENDED(1) AT Mon Jun 3 02:38:15 AM CEST 2024 [Status=0] (BSM tests) ENDED(1) AT Mon Jun 3 02:47:18 AM CEST 2024 [Status=0] 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 1 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt 0 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 164 +++--- .../log_eemumu_mad_f_inl0_hrd0.txt | 162 ++--- .../log_eemumu_mad_m_inl0_hrd0.txt | 158 ++--- .../log_ggtt_mad_d_inl0_hrd0.txt | 162 ++--- .../log_ggtt_mad_f_inl0_hrd0.txt | 160 ++--- .../log_ggtt_mad_m_inl0_hrd0.txt | 162 ++--- .../log_ggttg_mad_d_inl0_hrd0.txt | 160 ++--- .../log_ggttg_mad_f_inl0_hrd0.txt | 162 ++--- .../log_ggttg_mad_m_inl0_hrd0.txt | 162 ++--- .../log_ggttgg_mad_d_inl0_hrd0.txt | 555 ++---------------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 553 ++--------------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 555 ++---------------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 168 +++--- .../log_ggttggg_mad_f_inl0_hrd0.txt | 164 +++--- .../log_ggttggg_mad_m_inl0_hrd0.txt | 160 ++--- .../log_gqttq_mad_d_inl0_hrd0.txt | 160 ++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 158 ++--- .../log_gqttq_mad_m_inl0_hrd0.txt | 161 +++-- .../log_heftggbb_mad_d_inl0_hrd0.txt | 162 ++--- .../log_heftggbb_mad_f_inl0_hrd0.txt | 38 +- .../log_heftggbb_mad_m_inl0_hrd0.txt | 158 ++--- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 168 +++--- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 164 +++--- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 164 +++--- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 60 +- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 64 +- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 64 +- 
.../log_susyggtt_mad_d_inl0_hrd0.txt | 40 +- .../log_susyggtt_mad_f_inl0_hrd0.txt | 36 +- .../log_susyggtt_mad_m_inl0_hrd0.txt | 40 +- 30 files changed, 1966 insertions(+), 3278 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 41d66d8253..2606511c5e 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda - +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:25:56 +DATE: 2024-06-02_22:10:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7366s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7280s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1771s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1807s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.47E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3949s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3024s - [COUNTERS] Fortran MEs ( 1 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3993s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3064s + [COUNTERS] Fortran MEs ( 1 ) : 0.0929s for 90112 events => throughput is 9.70E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1957s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3835s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 90112 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0768s for 90112 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.192081e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221697e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.204613e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224133e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1822s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1903s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1859s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3602s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3125s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0476s for 90112 events => throughput is 1.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3675s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3196s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0478s for 90112 events => throughput is 1.88E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934007e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.009226e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891822e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1833s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1802s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1838s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3431s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 90112 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3454s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 90112 events => throughput is 2.46E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541984e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.401919e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.616899e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574310e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1841s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1810s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1873s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.54E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3042s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0346s for 90112 events => throughput is 2.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3092s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 90112 events => throughput is 2.57E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.662866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.678445e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.883371e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730941e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1818s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1849s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.02E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3474s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0430s for 90112 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3584s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0448s for 90112 events => throughput is 2.01E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.029340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.084448e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.231218e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211439e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.6140s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6135s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6084s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.59E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7363s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7314s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.86E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7381s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7330s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0051s for 90112 events => throughput is 1.78E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.277665e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.956984e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.916168e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914466e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.959957e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022770e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.493136e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.442801e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.970202e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.975645e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.040191e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.864611e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.002261e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.017368e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.140061e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.144056e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index c4c8099bbf..e035800d31 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -20,8 +20,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:26:13 +DATE: 2024-06-02_22:10:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7200s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7255s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1783s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1884s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1797s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.42E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3902s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2987s - [COUNTERS] Fortran MEs ( 1 ) : 0.0916s for 90112 events => throughput is 9.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3994s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3066s + [COUNTERS] Fortran MEs ( 1 ) : 0.0928s for 90112 events => throughput is 9.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1836s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1849s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.21E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3825s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3880s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0737s for 90112 events => throughput is 1.22E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.260929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262067e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264285e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1813s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1787s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1837s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1810s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.04E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3288s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0290s for 90112 events => throughput is 3.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3356s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0296s for 90112 events => throughput is 3.04E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206836e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.173804e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.334282e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.178187e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1835s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1836s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.34E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3304s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3034s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3359s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 90112 events => throughput is 3.26E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.522447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322945e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386931e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.473480e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1843s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.24E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3301s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3033s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0268s for 90112 events => throughput is 3.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3366s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3092s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 90112 events => throughput is 3.29E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.528072e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219490e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720927e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.481368e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1844s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1873s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1847s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.10E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3371s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0291s for 90112 events => throughput is 3.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3400s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3111s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0290s for 90112 events => throughput is 3.11E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.341186e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.352002e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.598530e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.429631e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.6090s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.68E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6098s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6093s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7297s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.95E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7372s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7325s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.93E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.546893e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.470249e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.804903e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847509e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.477327e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.381501e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.060127e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.048976e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.389797e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.378693e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.251129e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.245527e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.752691e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.733078e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.481445e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.449608e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index fc86f120db..f4bdf77873 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:26:29 +DATE: 2024-06-02_22:11:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7352s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7265s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1861s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1773s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1891s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1804s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.4045s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3124s - [COUNTERS] Fortran MEs ( 1 ) : 0.0920s for 90112 events => throughput is 9.79E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3984s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3056s + [COUNTERS] Fortran MEs ( 1 ) : 0.0928s for 90112 events => throughput is 9.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1967s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1895s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2031s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1958s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3901s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3110s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 90112 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3908s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3121s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0787s for 90112 events => throughput is 1.14E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.191141e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180947e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.200935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192933e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1846s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3486s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3030s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0456s for 90112 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3558s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 90112 events => throughput is 1.95E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.010123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003834e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.071657e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101928e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1830s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.39E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3039s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 90112 events => throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3474s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3105s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 90112 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.462700e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570225e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.639506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.646601e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1850s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1873s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.56E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3431s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3079s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 90112 events => throughput is 2.56E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3437s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0349s for 90112 events => throughput is 2.58E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.637002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503351e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.800572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664209e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1890s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.09E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3487s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3069s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0418s for 90112 events => throughput is 2.16E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3556s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 90112 events => throughput is 2.11E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.197743e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068822e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.278261e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.271696e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.6095s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6090s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -561,42 +561,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.356139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.813912e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953546e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.921066e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.960740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.979130e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.522141e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.509503e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.009432e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.019788e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.090602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.083255e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) 
-p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.972046e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.995167e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.157381e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.161218e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index e1be7813b6..df1862c6e5 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=cuda - +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:26:45 +DATE: 2024-06-02_22:11:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7787s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7874s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4146s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3704s - [COUNTERS] Fortran MEs ( 1 ) : 0.0442s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4145s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s + [COUNTERS] Fortran MEs ( 1 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7473s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2714s - [COUNTERS] Fortran MEs ( 1 ) : 0.4758s for 90112 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7613s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2812s + [COUNTERS] Fortran MEs ( 1 ) : 0.4801s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0396s for 8192 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 8192 events => throughput is 2.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.8025s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3670s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4355s for 90112 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8094s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4358s for 90112 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.124003e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.139840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123725e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4032s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4185s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3956s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.57E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6069s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2528s for 90112 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6287s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3727s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2560s for 90112 events => throughput is 3.52E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.613591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.603219e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.632391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.608626e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3994s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0139s for 8192 events => throughput is 5.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4087s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5056s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3497s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1559s for 90112 events => throughput is 5.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3601s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1588s for 90112 events => throughput is 5.68E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.889737e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.698027e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.921310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.833893e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3995s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4813s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3413s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1399s for 90112 events => throughput is 6.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5009s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3580s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1429s for 90112 events => throughput is 6.31E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.578026e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.422941e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.673606e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.567554e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3991s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4251s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4029s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.69E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5933s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3549s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2384s for 90112 events => throughput is 3.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6212s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2420s for 90112 events => throughput is 3.72E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.816986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.780392e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.814285e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.785329e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.8124s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.42E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8191s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8185s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,8 +546,8 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7642s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7574s + [COUNTERS] PROGRAM TOTAL : 1.7827s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7760s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.120396e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.043198e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.622859e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.609307e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.177398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.187148e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.080565e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081029e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.172657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.178903e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.155839e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.154384e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.173872e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194807e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.068966e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.086257e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 0b367d2d96..40923b92a9 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:27:12 +DATE: 2024-06-02_22:11:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,8 +58,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7751s + [COUNTERS] PROGRAM TOTAL : 0.8250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7812s [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4113s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7478s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2691s - [COUNTERS] Fortran MEs ( 1 ) : 0.4787s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7730s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2919s + [COUNTERS] Fortran MEs ( 1 ) : 0.4810s for 90112 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4108s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 8192 events => throughput is 2.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4157s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0375s for 8192 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7717s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3659s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4058s for 90112 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8031s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3895s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4136s for 90112 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.257844e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.242461e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.292052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262275e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3878s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4099s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.18E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5166s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3428s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1737s for 90112 events => throughput is 5.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5387s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3642s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1745s for 90112 events => throughput is 5.16E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.218996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.184177e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.263655e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.248092e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3915s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3833s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3964s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4300s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3376s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4481s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0939s for 90112 events => throughput is 9.59E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.896073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.679138e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.940492e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.562229e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3898s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3819s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4002s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4168s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3312s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0856s for 90112 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4436s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3562s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.032975e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034170e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.050779e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073727e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 8192 events => throughput is 7.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4028s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 7.03E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4646s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3397s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1249s for 90112 events => throughput is 7.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4886s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3609s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1277s for 90112 events => throughput is 7.06E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.304914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.325091e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.408593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.420865e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.8097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8091s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.50E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8126s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7654s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7594s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 90112 events => throughput is 1.51E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0342s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0284s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0058s for 90112 events => throughput is 1.55E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.397168e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.201338e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.912682e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.880181e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.099083e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.110951e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785250e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.785133e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.065057e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.085768e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.885291e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.869688e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.649544e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.657984e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.436840e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.441526e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 197f6200da..94bbdb8240 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,8 +1,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:27:37 +DATE: 2024-06-02_22:12:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8243s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7810s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8280s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7840s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4109s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3676s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4164s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3725s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7536s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2749s - [COUNTERS] Fortran MEs ( 1 ) : 0.4787s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7705s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2895s + [COUNTERS] Fortran MEs ( 1 ) : 0.4809s for 90112 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4520s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4599s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.8139s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3740s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4399s for 90112 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8301s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4447s for 90112 events => throughput is 2.03E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.079375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068435e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.079476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071645e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4154s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4227s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3999s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5989s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3477s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2513s for 90112 events => throughput is 3.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6239s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3717s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2522s for 90112 events => throughput is 3.57E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.535235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.641241e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688874e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.667711e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4007s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0139s for 8192 events => throughput is 5.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.83E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5000s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3471s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1530s for 90112 events => throughput is 5.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5203s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3637s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1566s for 90112 events => throughput is 5.75E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.781843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.707638e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.986506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.808160e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3898s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4034s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3908s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4677s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3316s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1361s for 90112 events => throughput is 6.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5038s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3639s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1399s for 90112 events => throughput is 6.44E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.671424e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.443112e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.782154e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.548414e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4150s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3944s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4234s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4017s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5820s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3527s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2292s for 90112 events => throughput is 3.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3746s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.998616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.920724e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.990048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.976399e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.8154s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8148s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8145s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8139s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7635s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 90112 events => throughput is 1.32E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7903s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7836s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.143723e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015619e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.636090e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.591542e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.182886e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.178918e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066867e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065499e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.180722e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.176442e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.146460e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149411e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.158610e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170597e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.030823e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084870e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 7f0ff41464..744f7cd9e1 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,9 +1,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:28:04 +DATE: 2024-06-02_22:12:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.7020s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3669s - [COUNTERS] Fortran MEs ( 1 ) : 0.3350s for 8192 events => throughput is 2.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7183s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3761s + [COUNTERS] Fortran MEs ( 1 ) : 0.3421s for 8192 events => throughput is 2.39E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3273s - [COUNTERS] Fortran MEs ( 1 ) : 0.3354s for 8192 events => throughput is 2.44E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6762s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3345s + [COUNTERS] Fortran MEs ( 1 ) : 0.3417s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.2831s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5823s - [COUNTERS] Fortran MEs ( 1 ) : 3.7008s for 90112 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3647s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6103s + [COUNTERS] Fortran MEs ( 1 ) : 3.7545s for 90112 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0127s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3448s for 8192 events => throughput is 2.38E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0244s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6751s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3493s for 8192 events => throughput is 2.35E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.7090s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9162s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7928s for 90112 events => throughput is 2.38E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7825s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9343s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8482s for 90112 events => throughput is 2.34E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.444412e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407027e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.433186e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413486e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1796s for 8192 events => throughput is 4.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6925s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1814s for 8192 events => throughput is 4.52E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.7766s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7859s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9906s for 90112 events => throughput is 4.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7969s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7715s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.0254s for 90112 events => throughput is 4.45E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.630294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.630694e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.629345e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.633592e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5178s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4262s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0916s for 8192 events => throughput is 8.94E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5188s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4270s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0917s for 8192 events => throughput is 8.93E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7021s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6979s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0042s for 90112 events => throughput is 8.97E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6871s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6808s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0063s for 90112 events => throughput is 8.96E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.244917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.164525e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.251295e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.217748e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4129s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 8192 events => throughput is 9.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4964s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0816s for 8192 events => throughput is 1.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5931s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6918s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9014s for 90112 events => throughput is 1.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5733s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8989s for 90112 events => throughput is 1.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.033892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034671e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.035720e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030845e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5700s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4516s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1184s for 8192 events => throughput is 6.92E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4520s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1183s for 8192 events => throughput is 6.92E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.0137s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7172s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2965s for 90112 events => throughput is 6.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.0127s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7123s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3005s for 90112 events => throughput is 6.93E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.144179e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.991688e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.156532e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.017662e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,8 +513,8 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7726s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7672s + [COUNTERS] PROGRAM TOTAL : 0.7773s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7719s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0397s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0409s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0178s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642318e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631318e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.930638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.120129e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.882259e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.935888e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244433e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244627e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.893041e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.951586e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.255841e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254745e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.907568e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.930773e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.774192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775392e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 1a8c36aa43..ed5f9117a3 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:28:48 +DATE: 2024-06-02_22:13:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.7057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3678s - [COUNTERS] Fortran MEs ( 1 ) : 0.3379s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3729s + [COUNTERS] Fortran MEs ( 1 ) : 0.3414s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3295s - [COUNTERS] Fortran MEs ( 1 ) : 0.3355s for 8192 events => throughput is 2.44E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6777s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3354s + [COUNTERS] Fortran MEs ( 1 ) : 0.3423s for 8192 events => throughput is 2.39E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3113s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5981s - [COUNTERS] Fortran MEs ( 1 ) : 3.7132s for 90112 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3614s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6067s + [COUNTERS] Fortran MEs ( 1 ) : 3.7546s for 90112 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9881s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6537s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3344s for 8192 events => throughput is 2.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9951s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6578s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3372s for 8192 events => throughput is 2.43E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.5951s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9082s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6869s for 90112 events => throughput is 2.44E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.6208s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9132s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7075s for 90112 events => throughput is 2.43E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531268e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.551463e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.536337e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.494483e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4297s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0998s for 8192 events => throughput is 8.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5327s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1013s for 8192 events => throughput is 8.09E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7829s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6780s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1050s for 90112 events => throughput is 8.16E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.8030s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6873s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1157s for 90112 events => throughput is 8.08E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.418674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.275405e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.378970e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.220360e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4226s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0469s for 8192 events => throughput is 1.75E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.1354s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6236s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5118s for 90112 events => throughput is 1.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1488s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6302s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5185s for 90112 events => throughput is 1.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.814902e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780438e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.818224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.775796e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4122s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3704s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0417s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4158s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3735s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0774s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4593s for 90112 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1028s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6328s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4700s for 90112 events => throughput is 1.92E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.016267e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959673e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.024583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972854e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4393s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3831s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0562s for 8192 events => throughput is 1.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3902s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0574s for 8192 events => throughput is 1.43E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.2433s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6150s for 90112 events => throughput is 1.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2861s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6550s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6311s for 90112 events => throughput is 1.43E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.471448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436598e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.496104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456217e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7661s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7652s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7686s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.65E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0217s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0116s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 90112 events => throughput is 8.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0336s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0236s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 90112 events => throughput is 9.00E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.279804e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286741e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.849139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.847775e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708780e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.718457e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.376255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.500599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.741880e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.731481e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.526731e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.446798e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.576787e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570459e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.626329e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 06cc385635..96ad54f38a 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:29:27 +DATE: 2024-06-02_22:14:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.7042s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s - [COUNTERS] Fortran MEs ( 1 ) : 0.3374s for 8192 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7139s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3728s + [COUNTERS] Fortran MEs ( 1 ) : 0.3411s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6678s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3296s - [COUNTERS] Fortran MEs ( 1 ) : 0.3383s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] Fortran MEs ( 1 ) : 0.3410s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3035s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5927s - [COUNTERS] Fortran MEs ( 1 ) : 3.7109s for 90112 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3605s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6049s + [COUNTERS] Fortran MEs ( 1 ) : 3.7556s for 90112 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0211s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6702s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3509s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0389s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6816s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3573s for 8192 events => throughput is 2.29E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.7816s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9205s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8611s for 90112 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.8404s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9370s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.9034s for 90112 events => throughput is 2.31E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.398467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.375362e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.396772e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376875e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6812s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5038s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1774s for 8192 events => throughput is 4.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1785s for 8192 events => throughput is 4.59E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.6988s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7524s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9464s for 90112 events => throughput is 4.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8198s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7730s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.0469s for 90112 events => throughput is 4.40E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.777911e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.714685e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.770421e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.706962e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5041s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0892s for 8192 events => throughput is 9.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0904s for 8192 events => throughput is 9.06E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6680s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9846s for 90112 events => throughput is 9.15E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6873s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0040s for 90112 events => throughput is 8.98E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.374488e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.227515e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.304457e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.179808e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4079s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4935s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0814s for 8192 events => throughput is 1.01E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5251s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6551s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8699s for 90112 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5536s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6688s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8848s for 90112 events => throughput is 1.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.072957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050974e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057681e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5672s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1197s for 8192 events => throughput is 6.85E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1219s for 8192 events => throughput is 6.72E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.0147s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7041s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3105s for 90112 events => throughput is 6.88E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.0588s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7176s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3412s for 90112 events => throughput is 6.72E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.810756e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.837164e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.935663e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.867816e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,8 +513,8 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7703s + [COUNTERS] PROGRAM TOTAL : 0.7767s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7713s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0407s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 90112 events => throughput is 3.91E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0439s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0209s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 90112 events => throughput is 3.92E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.631069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640467e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.120692e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.045816e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.856212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.873716e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.234939e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.232563e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.866138e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.885545e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.243613e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.243680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.862499e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.866976e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.731505e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723701e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 744dd47e66..c981799588 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -2,21 +2,21 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cpp512y make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:30:11 +DATE: 2024-06-02_22:14:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -47,20 +47,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.8074s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3646s - [COUNTERS] Fortran MEs ( 1 ) : 4.4427s for 8192 events => throughput is 1.84E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 + [UNWEIGHT] Wrote 11 events (found 187 events) + [COUNTERS] PROGRAM TOTAL : 4.6980s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2611s + [COUNTERS] Fortran MEs ( 1 ) : 4.4369s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,20 +72,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.8076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3603s - [COUNTERS] Fortran MEs ( 1 ) : 4.4473s for 8192 events => throughput is 1.84E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 4.6906s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s + [COUNTERS] Fortran MEs ( 1 ) : 4.4335s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,20 +97,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 50.3676s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1555s - [COUNTERS] Fortran MEs ( 1 ) : 48.2121s for 90112 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 + [UNWEIGHT] Wrote 18 events (found 294 events) + [COUNTERS] PROGRAM TOTAL : 50.6666s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8936s + [COUNTERS] Fortran MEs ( 1 ) : 48.7731s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,482 +122,45 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 9.2196s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7253s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4943s for 8192 events => throughput is 1.82E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222236] fbridge_mode=1 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 9.2716s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7071s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5645s for 8192 events => throughput is 1.79E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320556621222236) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 56.2683s - [COUNTERS] Fortran Overhead ( 0 ) : 6.5201s - [COUNTERS] CudaCpp MEs ( 2 ) : 49.7482s for 90112 events => throughput is 1.81E+03 events/s - -*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (5.551115123125783e-16) - -*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868635e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.874481e+03 ) sec^-1 - -*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! 
Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 5.0549s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6552s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3997s for 8192 events => throughput is 3.41E+03 events/s - -*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! 
Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 30.7729s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4162s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.3568s for 90112 events => throughput is 3.42E+03 events/s - -*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (6.661338147750939e-16) - -*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.598914e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.606768e+03 ) sec^-1 - -*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! 
Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.4142s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3728s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0413s for 8192 events => throughput is 7.87E+03 events/s - -*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.6025s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1268s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.4757s for 90112 events => throughput is 7.85E+03 events/s - -*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.082204e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.087591e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.1594s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9156s for 8192 events => throughput is 8.95E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.1097s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0095s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.1002s for 90112 events => throughput is 8.92E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.157056e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.190937e+03 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7205s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5345s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1860s for 8192 events => throughput is 6.91E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 16.4021s - [COUNTERS] Fortran Overhead ( 0 ) : 3.3045s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.0976s for 90112 events => throughput is 6.88E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.943395e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.968493e+03 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9037s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cuda (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.9957s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6318s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s - -*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cuda (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.275863e+05 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.513394e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.126700e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.163753e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.128674e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.183392e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.130320e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.455396e+05 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** - -TEST COMPLETED +ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 +6,8c6,8 +< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +--- +> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +20c20 +< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +--- +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +22c22 +< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +--- +> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +24c24 +< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. 
+--- +> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 97726609cd..e5afb01bb6 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:34:39 +DATE: 2024-06-02_22:16:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -47,20 +47,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.7506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3591s - [COUNTERS] Fortran MEs ( 1 ) : 4.3916s for 8192 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 + [UNWEIGHT] Wrote 11 events (found 187 events) + [COUNTERS] PROGRAM TOTAL : 4.6224s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s + [COUNTERS] Fortran MEs ( 1 ) : 4.3645s for 8192 events => throughput is 1.88E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,20 +72,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7327s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3562s - [COUNTERS] Fortran MEs ( 1 ) : 4.3765s for 8192 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 4.6165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2540s + [COUNTERS] Fortran MEs ( 1 ) : 4.3625s for 8192 events => throughput is 1.88E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,20 +97,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 50.4568s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1771s - [COUNTERS] Fortran MEs ( 1 ) : 48.2797s for 90112 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 + [UNWEIGHT] Wrote 18 events (found 294 events) + [COUNTERS] PROGRAM TOTAL : 50.5574s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8890s + [COUNTERS] Fortran MEs ( 1 ) : 48.6684s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,482 +122,45 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.9135s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5714s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3421s for 8192 events => throughput is 1.89E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320716609204404] fbridge_mode=1 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 8.9166s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5182s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3984s for 8192 events => throughput is 1.86E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703729438336302E-004) differ by less than 4E-4 (3.021119383106452e-06) +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320716609204404) differ by less than 4E-4 (3.453930475627587e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 54.1190s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3214s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.7976s for 90112 events => throughput is 1.89E+03 events/s - -*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004) differ by less than 4E-4 (3.0382263187522796e-06) - -*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.947180e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.947353e+03 ) sec^-1 - -*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! 
Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7184s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5261s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1923s for 8192 events => throughput is 6.87E+03 events/s - -*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722581317850E-004) differ by less than 4E-4 (2.843951981690296e-06) - -*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 16.5068s - [COUNTERS] Fortran Overhead ( 0 ) : 3.3022s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.2046s for 90112 events => throughput is 6.82E+03 events/s - -*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004) differ by less than 4E-4 (2.856718252175483e-06) - -*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.978396e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.983793e+03 ) sec^-1 - -*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.4122s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8788s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5334s for 8192 events => throughput is 1.54E+04 events/s - -*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) - -*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 8.5565s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6677s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.8888s for 90112 events => throughput is 1.53E+04 events/s - -*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) - -*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.577584e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.577489e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.2916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4723s for 8192 events => throughput is 1.73E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.7633s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5957s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.1676s for 90112 events => throughput is 1.74E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.818661e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.824534e+04 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.5274s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9418s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5856s for 8192 events => throughput is 1.40E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703728658657426E-004) differ by less than 4E-4 (3.0009745224379714e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.1749s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6987s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.4761s for 90112 events => throughput is 1.39E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004) differ by less than 4E-4 (3.0604373708609245e-06) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.413533e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.415193e+04 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8443s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cuda (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.8423s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2358s for 90112 events => throughput is 3.82E+05 events/s - -*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cuda (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) - -*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.583101e+05 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.931306e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.570292e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.724382e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.573894e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.720622e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.518798e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.538253e+05 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** - -TEST COMPLETED +ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 +6,8c6,8 +< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +--- +> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +20c20 +< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +--- +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +22c22 +< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +--- +> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +24c24 +< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. 
+--- +> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 9161616d22..05784aaa7b 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,17 +3,17 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppavx2 + +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:38:10 +DATE: 2024-06-02_22:17:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -47,20 +47,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.7461s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3596s - [COUNTERS] Fortran MEs ( 1 ) : 4.3864s for 8192 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 + [UNWEIGHT] Wrote 11 events (found 187 events) + [COUNTERS] PROGRAM TOTAL : 4.6921s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2633s + [COUNTERS] Fortran MEs ( 1 ) : 4.4288s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,20 +72,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7384s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3570s - [COUNTERS] Fortran MEs ( 1 ) : 4.3814s for 8192 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 4.6873s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2561s + [COUNTERS] Fortran MEs ( 1 ) : 4.4311s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,20 +97,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 50.4226s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1630s - [COUNTERS] Fortran MEs ( 1 ) : 48.2596s for 90112 events => throughput is 1.87E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 + [UNWEIGHT] Wrote 18 events (found 294 events) + [COUNTERS] PROGRAM TOTAL : 50.6939s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8954s + [COUNTERS] Fortran MEs ( 1 ) : 48.7985s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,482 +122,45 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 9.4283s - [COUNTERS] Fortran Overhead ( 0 ) : 4.8187s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6095s for 8192 events => throughput is 1.78E+03 events/s + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556893412546] fbridge_mode=1 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 9.3537s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7437s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6100s for 8192 events => throughput is 1.78E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612659176674E-004) differ by less than 2E-4 (3.851690077993908e-09) +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320556893412546) differ by less than 2E-4 (5.876231279344779e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 56.9207s - [COUNTERS] Fortran Overhead ( 0 ) : 6.6165s - [COUNTERS] CudaCpp MEs ( 2 ) : 50.3042s for 90112 events => throughput is 1.79E+03 events/s - -*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004) differ by less than 2E-4 (3.930950231989527e-09) - -*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.845749e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855084e+03 ) sec^-1 - -*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! 
Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 5.0818s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6854s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3963s for 8192 events => throughput is 3.42E+03 events/s - -*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612692816703E-004) differ by less than 2E-4 (4.720860369289426e-09) - -*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 30.7891s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4469s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.3422s for 90112 events => throughput is 3.42E+03 events/s - -*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004) differ by less than 2E-4 (4.1013439311399225e-09) - -*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.522464e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523204e+03 ) sec^-1 - -*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.3962s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3623s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0339s for 8192 events => throughput is 7.92E+03 events/s - -*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) - -*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.5036s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1328s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.3708s for 90112 events => throughput is 7.92E+03 events/s - -*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) - -*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.125066e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.074324e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.1502s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2343s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9159s for 8192 events => throughput is 8.94E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.9975s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0012s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.9962s for 90112 events => throughput is 9.01E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.351869e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.208773e+03 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7414s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5394s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2020s for 8192 events => throughput is 6.82E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 16.5337s - [COUNTERS] Fortran Overhead ( 0 ) : 3.3003s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.2334s for 90112 events => throughput is 6.81E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.881370e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.943865e+03 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8685s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 8192 events => throughput is 2.49E+05 events/s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cuda (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.9899s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6262s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3637s for 90112 events => throughput is 2.48E+05 events/s - -*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cuda (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) - -*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.280457e+05 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523385e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.121733e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.162091e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.125438e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.168234e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.125929e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.451563e+05 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** - -TEST COMPLETED +ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 +6,8c6,8 +< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +--- +> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +20c20 +< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +--- +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +22c22 +< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +--- +> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +24c24 +< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. 
+--- +> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index f87c8c9cf1..b9fb2f5206 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,13 +1,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' @@ -18,11 +18,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:43:56 +DATE: 2024-06-02_22:19:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 101.7235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5049s - [COUNTERS] Fortran MEs ( 1 ) : 101.2186s for 8192 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1254s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5184s + [COUNTERS] Fortran MEs ( 1 ) : 102.6070s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 101.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5109s - [COUNTERS] Fortran MEs ( 1 ) : 101.2594s for 8192 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1957s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5142s + [COUNTERS] Fortran MEs ( 1 ) : 102.6815s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1119.4272s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4297s - [COUNTERS] Fortran MEs ( 1 ) : 1114.9976s for 90112 events => throughput is 8.08E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1132.8870s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4650s + [COUNTERS] Fortran MEs ( 1 ) : 1128.4220s for 90112 events => throughput is 7.99E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 236.4243s - [COUNTERS] Fortran Overhead ( 0 ) : 108.7156s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.7087s for 8192 events => throughput is 6.41E+01 events/s + [COUNTERS] PROGRAM TOTAL : 242.5730s + [COUNTERS] Fortran Overhead ( 0 ) : 110.5170s + [COUNTERS] CudaCpp MEs ( 2 ) : 132.0559s for 8192 events => throughput is 6.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1507.2346s - [COUNTERS] Fortran Overhead ( 0 ) : 112.2012s - [COUNTERS] CudaCpp MEs ( 2 ) : 1395.0334s for 90112 events => throughput is 6.46E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1558.9829s + [COUNTERS] Fortran Overhead ( 0 ) : 114.2344s + [COUNTERS] CudaCpp MEs ( 2 ) : 1444.7485s for 90112 events => throughput is 6.24E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.611342e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.436157e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.612518e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.453513e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 114.8253s - [COUNTERS] Fortran Overhead ( 0 ) : 52.9129s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.9124s for 8192 events => throughput is 1.32E+02 events/s + [COUNTERS] PROGRAM TOTAL : 115.6810s + [COUNTERS] Fortran Overhead ( 0 ) : 53.3874s + [COUNTERS] CudaCpp MEs ( 2 ) : 62.2936s for 8192 events => throughput is 1.32E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 734.6001s - [COUNTERS] Fortran Overhead ( 0 ) : 56.8950s - [COUNTERS] CudaCpp MEs ( 2 ) : 677.7051s for 90112 events => throughput is 1.33E+02 events/s + [COUNTERS] PROGRAM TOTAL : 738.8265s + [COUNTERS] Fortran Overhead ( 0 ) : 57.1821s + [COUNTERS] CudaCpp MEs ( 2 ) : 681.6444s for 90112 events => throughput is 1.32E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.573216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557219e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.570652e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557436e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 53.5594s - [COUNTERS] Fortran Overhead ( 0 ) : 24.8692s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.6902s for 8192 events => throughput is 2.86E+02 events/s + [COUNTERS] PROGRAM TOTAL : 54.2051s + [COUNTERS] Fortran Overhead ( 0 ) : 24.9796s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.2256s for 8192 events => throughput is 2.80E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 345.8816s - [COUNTERS] Fortran Overhead ( 0 ) : 28.6165s - [COUNTERS] CudaCpp MEs ( 2 ) : 317.2651s for 90112 events => throughput is 2.84E+02 events/s + [COUNTERS] PROGRAM TOTAL : 350.6938s + [COUNTERS] Fortran Overhead ( 0 ) : 29.4308s + [COUNTERS] CudaCpp MEs ( 2 ) : 321.2629s for 90112 events => throughput is 2.80E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.346027e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.356831e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.397864e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330655e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 47.5433s - [COUNTERS] Fortran Overhead ( 0 ) : 21.7991s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7442s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.9010s + [COUNTERS] Fortran Overhead ( 0 ) : 22.3111s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.5899s for 8192 events => throughput is 2.97E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 310.2994s - [COUNTERS] Fortran Overhead ( 0 ) : 25.7446s - [COUNTERS] CudaCpp MEs ( 2 ) : 284.5548s for 90112 events => throughput is 3.17E+02 events/s + [COUNTERS] PROGRAM TOTAL : 317.7874s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2827s + [COUNTERS] CudaCpp MEs ( 2 ) : 291.5048s for 90112 events => throughput is 3.09E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.866314e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.890746e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.859864e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795517e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 50.4926s - [COUNTERS] Fortran Overhead ( 0 ) : 24.7479s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7447s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 51.3811s + [COUNTERS] Fortran Overhead ( 0 ) : 25.1741s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2071s for 8192 events => throughput is 3.13E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 313.6701s - [COUNTERS] Fortran Overhead ( 0 ) : 28.5625s - [COUNTERS] CudaCpp MEs ( 2 ) : 285.1075s for 90112 events => throughput is 3.16E+02 events/s + [COUNTERS] PROGRAM TOTAL : 320.5201s + [COUNTERS] Fortran Overhead ( 0 ) : 29.2606s + [COUNTERS] CudaCpp MEs ( 2 ) : 291.2595s for 90112 events => throughput is 3.09E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.394651e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.335685e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.384790e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330505e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 4.2708s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1879s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0829s for 8192 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2720s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1877s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0843s for 8192 events => throughput is 7.55E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 19.1407s - [COUNTERS] Fortran Overhead ( 0 ) : 7.1896s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9511s for 90112 events => throughput is 7.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 19.1951s + [COUNTERS] Fortran Overhead ( 0 ) : 7.2572s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9379s for 90112 events => throughput is 7.55E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.518899e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.550389e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.266687e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.286009e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.285867e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.250646e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 
(gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.577065e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.571508e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.302340e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.277996e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.485177e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.464805e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239249e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.271016e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.236704e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.239190e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 9938780c0a..6600eb2c20 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - - make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_03:16:41 +DATE: 2024-06-02_23:54:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 101.6786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5071s - [COUNTERS] Fortran MEs ( 1 ) : 101.1715s for 8192 events => throughput is 8.10E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1481s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5128s + [COUNTERS] Fortran MEs ( 1 ) : 102.6352s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 102.1420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5108s - [COUNTERS] Fortran MEs ( 1 ) : 101.6312s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.0623s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5146s + [COUNTERS] Fortran MEs ( 1 ) : 102.5477s for 8192 events => throughput is 7.99E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1119.6489s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4200s - [COUNTERS] Fortran MEs ( 1 ) : 1115.2289s for 90112 events => throughput is 8.08E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1133.8589s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4674s + [COUNTERS] Fortran MEs ( 1 ) : 1129.3915s for 90112 events => throughput is 7.98E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -134,9 +134,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 207.9761s - [COUNTERS] Fortran Overhead ( 0 ) : 95.5518s - [COUNTERS] CudaCpp MEs ( 2 ) : 112.4243s for 8192 events => throughput is 7.29E+01 events/s + [COUNTERS] PROGRAM TOTAL : 211.4040s + [COUNTERS] Fortran Overhead ( 0 ) : 97.0025s + [COUNTERS] CudaCpp MEs ( 2 ) : 114.4015s for 8192 events => throughput is 7.16E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -156,7 +156,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -168,9 +168,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1342.0233s - [COUNTERS] Fortran Overhead ( 0 ) : 99.5419s - [COUNTERS] CudaCpp MEs ( 2 ) : 1242.4814s for 90112 events => throughput is 7.25E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1358.5973s + [COUNTERS] Fortran Overhead ( 0 ) : 101.0487s + [COUNTERS] CudaCpp MEs ( 2 ) : 1257.5486s for 90112 events => throughput is 7.17E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -183,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.627892e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.461850e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.617246e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.478420e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,7 +200,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -212,9 +212,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 52.4912s - [COUNTERS] Fortran Overhead ( 0 ) : 24.8093s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.6819s for 8192 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 25.1661s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2188s for 8192 events => throughput is 2.90E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,7 +234,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -246,9 +246,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 333.5578s - [COUNTERS] Fortran Overhead ( 0 ) : 28.7441s - [COUNTERS] CudaCpp MEs ( 2 ) : 304.8137s for 90112 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 339.1541s + [COUNTERS] Fortran Overhead ( 0 ) : 29.1542s + [COUNTERS] CudaCpp MEs ( 2 ) : 309.9999s for 90112 events => throughput is 2.91E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -261,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.354565e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372421e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.352919e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331584e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -290,9 +290,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 26.9353s - [COUNTERS] Fortran Overhead ( 0 ) : 12.5805s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.3549s for 8192 events => throughput is 5.71E+02 events/s + [COUNTERS] PROGRAM TOTAL : 27.8063s + [COUNTERS] Fortran Overhead ( 0 ) : 12.9789s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.8274s for 8192 events => throughput is 5.52E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,7 +312,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -324,9 +324,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 175.5244s - [COUNTERS] Fortran Overhead ( 0 ) : 16.6137s - [COUNTERS] CudaCpp MEs ( 2 ) : 158.9107s for 90112 events => throughput is 5.67E+02 events/s + [COUNTERS] PROGRAM TOTAL : 179.6098s + [COUNTERS] Fortran Overhead ( 0 ) : 16.9911s + [COUNTERS] CudaCpp MEs ( 2 ) : 162.6187s for 90112 events => throughput is 5.54E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -339,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.796719e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.562410e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.813418e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.526748e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -356,7 +356,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -368,9 +368,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 24.0035s - [COUNTERS] Fortran Overhead ( 0 ) : 11.1136s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.8899s for 8192 events => throughput is 6.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 24.3059s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.9137s for 8192 events => throughput is 6.34E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -390,7 +390,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -402,9 +402,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 155.2962s - [COUNTERS] Fortran Overhead ( 0 ) : 15.0900s - [COUNTERS] CudaCpp MEs ( 2 ) : 140.2063s for 90112 events => throughput is 6.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 157.4560s + [COUNTERS] Fortran Overhead ( 0 ) : 15.2596s + [COUNTERS] CudaCpp MEs ( 2 ) : 142.1964s for 90112 events => throughput is 6.34E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -417,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.795563e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.624295e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.781198e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.660814e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -434,7 +434,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -446,9 +446,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 25.3167s - [COUNTERS] Fortran Overhead ( 0 ) : 12.6125s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.7041s for 8192 events => throughput is 6.45E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.8895s + [COUNTERS] Fortran Overhead ( 0 ) : 12.8401s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.0494s for 8192 events => throughput is 6.28E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -468,7 +468,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -480,9 +480,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 157.3576s - [COUNTERS] Fortran Overhead ( 0 ) : 16.6484s - [COUNTERS] CudaCpp MEs ( 2 ) : 140.7092s for 90112 events => throughput is 6.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 161.6641s + [COUNTERS] Fortran Overhead ( 0 ) : 16.8758s + [COUNTERS] CudaCpp MEs ( 2 ) : 144.7883s for 90112 events => throughput is 6.22E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -495,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.841550e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.696273e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.830427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.656469e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,7 +512,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -523,9 +523,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 2.5511s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0591s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4920s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5420s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0407s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5013s for 8192 events => throughput is 1.63E+04 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,7 +545,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -556,9 +556,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 11.4353s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0032s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4321s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.4677s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0517s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4160s for 90112 events => throughput is 1.66E+04 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -571,42 +571,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.630624e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.632797e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646596e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.633172e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329013e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.316402e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 
(gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.359221e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353428e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329144e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297312e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.339287e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.399287e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.285838e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336831e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.423096e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.375180e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 9cddd5fe7c..91cbe4e948 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_04:26:13 +DATE: 2024-06-03_01:05:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 101.9892s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5028s - [COUNTERS] Fortran MEs ( 1 ) : 101.4864s for 8192 events => throughput is 8.07E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.0692s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5126s + [COUNTERS] Fortran MEs ( 1 ) : 102.5566s for 8192 events => throughput is 7.99E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 101.7400s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5055s - [COUNTERS] Fortran MEs ( 1 ) : 101.2345s for 8192 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.0859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5173s + [COUNTERS] Fortran MEs ( 1 ) : 102.5686s for 8192 events => throughput is 7.99E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1119.6356s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4254s - [COUNTERS] Fortran MEs ( 1 ) : 1115.2102s for 90112 events => throughput is 8.08E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1132.5916s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4570s + [COUNTERS] Fortran MEs ( 1 ) : 1128.1345s for 90112 events => throughput is 7.99E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 229.4020s - [COUNTERS] Fortran Overhead ( 0 ) : 103.2152s - [COUNTERS] CudaCpp MEs ( 2 ) : 126.1868s for 8192 events => throughput is 6.49E+01 events/s + [COUNTERS] PROGRAM TOTAL : 237.2700s + [COUNTERS] Fortran Overhead ( 0 ) : 112.2091s + [COUNTERS] CudaCpp MEs ( 2 ) : 125.0609s for 8192 events => throughput is 6.55E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1528.1049s - [COUNTERS] Fortran Overhead ( 0 ) : 113.9982s - [COUNTERS] CudaCpp MEs ( 2 ) : 1414.1067s for 90112 events => throughput is 6.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1488.3180s + [COUNTERS] Fortran Overhead ( 0 ) : 107.9254s + [COUNTERS] CudaCpp MEs ( 2 ) : 1380.3926s for 90112 events => throughput is 6.53E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.425842e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.475500e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.948869e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.435461e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 117.4242s - [COUNTERS] Fortran Overhead ( 0 ) : 53.6967s - [COUNTERS] CudaCpp MEs ( 2 ) : 63.7275s for 8192 events => throughput is 1.29E+02 events/s + [COUNTERS] PROGRAM TOTAL : 119.9634s + [COUNTERS] Fortran Overhead ( 0 ) : 54.9402s + [COUNTERS] CudaCpp MEs ( 2 ) : 65.0232s for 8192 events => throughput is 1.26E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 756.7451s - [COUNTERS] Fortran Overhead ( 0 ) : 57.7650s - [COUNTERS] CudaCpp MEs ( 2 ) : 698.9802s for 90112 events => throughput is 1.29E+02 events/s + [COUNTERS] PROGRAM TOTAL : 780.5801s + [COUNTERS] Fortran Overhead ( 0 ) : 59.5507s + [COUNTERS] CudaCpp MEs ( 2 ) : 721.0294s for 90112 events => throughput is 1.25E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.540886e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.491581e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.526888e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.498726e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 51.3666s - [COUNTERS] Fortran Overhead ( 0 ) : 23.6472s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.7194s for 8192 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 52.6745s + [COUNTERS] Fortran Overhead ( 0 ) : 24.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.3844s for 8192 events => throughput is 2.89E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 334.4015s - [COUNTERS] Fortran Overhead ( 0 ) : 27.7321s - [COUNTERS] CudaCpp MEs ( 2 ) : 306.6693s for 90112 events => throughput is 2.94E+02 events/s + [COUNTERS] PROGRAM TOTAL : 338.8982s + [COUNTERS] Fortran Overhead ( 0 ) : 28.2101s + [COUNTERS] CudaCpp MEs ( 2 ) : 310.6881s for 90112 events => throughput is 2.90E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.517938e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.451072e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.525606e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.475095e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 44.9641s - [COUNTERS] Fortran Overhead ( 0 ) : 20.5328s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4313s for 8192 events => throughput is 3.35E+02 events/s + [COUNTERS] PROGRAM TOTAL : 46.3456s + [COUNTERS] Fortran Overhead ( 0 ) : 21.3056s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.0400s for 8192 events => throughput is 3.27E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 294.2131s - [COUNTERS] Fortran Overhead ( 0 ) : 24.5260s - [COUNTERS] CudaCpp MEs ( 2 ) : 269.6871s for 90112 events => throughput is 3.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 298.0662s + [COUNTERS] Fortran Overhead ( 0 ) : 24.8292s + [COUNTERS] CudaCpp MEs ( 2 ) : 273.2370s for 90112 events => throughput is 3.30E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.118767e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.043947e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.099496e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.044016e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.6272s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9735s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6537s for 8192 events => throughput is 3.19E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.3351s + [COUNTERS] Fortran Overhead ( 0 ) : 24.5435s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7916s for 8192 events => throughput is 3.18E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 308.1385s - [COUNTERS] Fortran Overhead ( 0 ) : 28.1125s - [COUNTERS] CudaCpp MEs ( 2 ) : 280.0260s for 90112 events => throughput is 3.22E+02 events/s + [COUNTERS] PROGRAM TOTAL : 314.8535s + [COUNTERS] Fortran Overhead ( 0 ) : 28.5171s + [COUNTERS] CudaCpp MEs ( 2 ) : 286.3364s for 90112 events => throughput is 3.15E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.494217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418688e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.498492e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.414882e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 3.6127s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8648s for 8192 events => throughput is 9.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6355s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7751s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8603s for 8192 events => throughput is 9.52E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 16.2177s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7356s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4821s for 90112 events => throughput is 9.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.2537s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7785s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4752s for 90112 events => throughput is 9.51E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.422089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.440992e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074505e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083652e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108350e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110058e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 
(gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160591e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.159066e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110190e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113035e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113742e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111683e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.112799e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106360e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.647292e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.643131e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index c909267a2d..9ef918bb40 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:42:38 +DATE: 2024-06-02_22:18:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4067s - [COUNTERS] Fortran MEs ( 1 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4837s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4091s + [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4101s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3363s - [COUNTERS] Fortran MEs ( 1 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3388s + [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3997s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5919s - [COUNTERS] Fortran MEs ( 1 ) : 0.8079s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4333s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6135s + [COUNTERS] Fortran MEs ( 1 ) : 0.8198s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4922s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4133s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4982s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4179s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0803s for 8192 events => throughput is 1.02E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5284s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6490s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8794s for 90112 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5615s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6764s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8851s for 90112 events => throughput is 1.02E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.038604e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.031593e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.042402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032655e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4203s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3775s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0428s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0442s for 8192 events => throughput is 1.85E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0947s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6149s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1183s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6354s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4829s for 90112 events => throughput is 1.87E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.949373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899679e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.919896e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911483e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3846s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3601s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3660s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8777s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6020s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2756s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9014s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6204s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2810s for 90112 events => throughput is 3.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.197689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.204350e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.341628e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.215560e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3821s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3591s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3630s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8414s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5939s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2475s for 90112 events => throughput is 3.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8699s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6164s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2535s for 90112 events => throughput is 3.56E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.543777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639452e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651673e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4059s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3719s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 8192 events => throughput is 2.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4108s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3757s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.34E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0051s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6219s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3832s for 90112 events => throughput is 2.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0218s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3887s for 90112 events => throughput is 2.32E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.349902e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.346690e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.363260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.362776e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7762s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7756s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7749s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0116s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0036s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.13E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0315s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0234s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 90112 events => throughput is 1.11E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.582112e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.544378e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.121699e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.084339e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.529721e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.545581e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.531629e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.516553e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.538677e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.562082e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.807286e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.797868e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.530045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.544309e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.786814e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 4ac5ec3dc1..61b60dbfe4 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_13:45:22 +DATE: 2024-06-02_22:18:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4107s - [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4838s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s + [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4161s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3412s - [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4164s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3413s + [COUNTERS] Fortran MEs ( 1 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4361s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6155s - [COUNTERS] Fortran MEs ( 1 ) : 0.8206s for 90112 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4404s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6194s + [COUNTERS] Fortran MEs ( 1 ) : 0.8210s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4907s + [COUNTERS] PROGRAM TOTAL : 0.4906s [COUNTERS] Fortran Overhead ( 0 ) : 0.4145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0762s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0761s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5059s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8394s for 90112 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5157s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8421s for 90112 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092440e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092654e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096977e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,8 +209,8 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3934s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3663s + [COUNTERS] PROGRAM TOTAL : 0.3952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3682s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 8192 events => throughput is 3.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9232s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6228s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3004s for 90112 events => throughput is 3.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9207s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2985s for 90112 events => throughput is 3.02E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.039638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.030351e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.034015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.053000e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3577s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3664s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3526s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.92E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7502s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6011s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1490s for 90112 events => throughput is 6.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7549s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6032s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1517s for 90112 events => throughput is 5.94E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.923571e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.050114e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.853443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.136523e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3514s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7444s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6060s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1385s for 90112 events => throughput is 6.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7428s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6036s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1392s for 90112 events => throughput is 6.47E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.334346e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.414981e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.476144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.641906e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -438,9 +438,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3762s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3583s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 8192 events => throughput is 4.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3768s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3587s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0181s for 8192 events => throughput is 4.52E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -460,7 +460,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -472,9 +472,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8096s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6129s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1967s for 90112 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8044s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6082s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1962s for 90112 events => throughput is 4.59E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -487,12 +487,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.750606e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.700097e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.766894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.791964e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -504,7 +504,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -515,9 +515,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7793s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7788s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7736s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7730s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.53E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -537,7 +537,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -548,9 +548,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0413s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0348s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0277s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0213s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -563,42 +563,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.566939e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.752984e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.326602e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.407709e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.593547e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.072830e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.720103e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.713385e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.619232e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.043933e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.806222e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.812134e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144615e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.573075e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.016256e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.978342e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 23f8d1233a..077dc6a885 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,22 +1,22 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_01:43:25 +DATE: 2024-06-02_22:19:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,8 +58,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4870s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s + [COUNTERS] PROGRAM TOTAL : 0.4856s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4106s [COUNTERS] Fortran MEs ( 1 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4171s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3424s - [COUNTERS] Fortran MEs ( 1 ) : 0.0747s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4160s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3409s + [COUNTERS] Fortran MEs ( 1 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4382s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6232s - [COUNTERS] Fortran MEs ( 1 ) : 0.8150s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4475s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6257s + [COUNTERS] Fortran MEs ( 1 ) : 0.8218s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.5038s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4232s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0806s for 8192 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5015s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0809s for 8192 events => throughput is 1.01E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5722s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6823s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8899s for 90112 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5666s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8923s for 90112 events => throughput is 1.01E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.030982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.031508e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.031715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030453e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3811s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.1135s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6453s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4682s for 90112 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1044s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6391s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4653s for 90112 events => throughput is 1.94E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.925046e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916584e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.926756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929453e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -286,8 +286,8 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) [COUNTERS] PROGRAM TOTAL : 0.3913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3660s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 8192 events => throughput is 3.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8962s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2769s for 90112 events => throughput is 3.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8986s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6180s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2806s for 90112 events => throughput is 3.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.266154e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276659e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.269502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.243257e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3831s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3610s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3866s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3640s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.64E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8418s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2426s for 90112 events => throughput is 3.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8612s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2454s for 90112 events => throughput is 3.67E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.789065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.762930e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.854777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.712469e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,10 +426,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 @@ -438,9 +437,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4096s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3737s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0358s for 8192 events => throughput is 2.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3795s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0362s for 8192 events => throughput is 2.26E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -460,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -471,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0275s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6170s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4105s for 90112 events => throughput is 2.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0311s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4005s for 90112 events => throughput is 2.25E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.335204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283401e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.335548e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.295262e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -503,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -514,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7724s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7745s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -536,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -547,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0091s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0010s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 90112 events => throughput is 1.12E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0305s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.12E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632055e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.542291e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.046951e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.061498e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.534300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.547161e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.533151e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.524448e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.529948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.546105e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.832656e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.795446e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.532773e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.543201e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.792603e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.784959e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 7b51bb9221..26ba9b7ba1 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -2,8 +2,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/h make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_05:58:47 +DATE: 2024-06-03_02:38:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9450s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8961s - [COUNTERS] Fortran MEs ( 1 ) : 0.0489s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9553s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9057s + [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4283s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3794s - [COUNTERS] Fortran MEs ( 1 ) : 0.0489s for 8192 events => throughput is 1.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3760s + [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp/avalassi/output_heftggbb_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8593s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3225s - [COUNTERS] Fortran MEs ( 1 ) : 0.5368s for 90112 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8801s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3361s + [COUNTERS] Fortran MEs ( 1 ) : 0.5440s for 90112 events => throughput is 1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256148] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4564s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4617s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4178s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8316s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4781s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8500s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3666s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4834s for 90112 events => throughput is 1.86E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.935091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.915502e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.955688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.921168e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3966s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 8192 events => throughput is 3.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4235s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3989s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.6048s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2698s for 90112 events => throughput is 3.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2758s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.396840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.381075e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.435183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.398450e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4008s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3863s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4040s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.5134s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3430s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1705s for 90112 events => throughput is 5.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5233s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3587s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1646s for 90112 events => throughput is 5.48E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.293625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.463798e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.214602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.554107e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3999s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3865s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 8192 events => throughput is 6.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4072s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.04E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.4652s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1469s for 90112 events => throughput is 6.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4890s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3389s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1501s for 90112 events => throughput is 6.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.964595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.050885e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.873725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.130883e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4159s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4200s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.60E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.5767s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3320s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2446s for 90112 events => throughput is 3.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6056s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3528s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2528s for 90112 events => throughput is 3.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.692127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666281e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.804832e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687428e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256165] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8061s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.36E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8148s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377573] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.7545s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7684s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 90112 events => throughput is 1.32E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.804334e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.987095e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.230995e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.307521e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.136468e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.131525e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.810257e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.834387e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.115441e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.134822e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048562e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047145e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.115501e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.124622e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.748994e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.749909e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index d09b81d7d3..c173d3145a 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=cuda - +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_05:59:14 +DATE: 2024-06-03_02:38:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8939s - [COUNTERS] Fortran MEs ( 1 ) : 0.0495s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9069s + [COUNTERS] Fortran MEs ( 1 ) : 0.0498s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3772s - [COUNTERS] Fortran MEs ( 1 ) : 0.0484s for 8192 events => throughput is 1.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4239s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3743s + [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp/avalassi/output_heftggbb_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8568s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3199s - [COUNTERS] Fortran MEs ( 1 ) : 0.5369s for 90112 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8822s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3377s + [COUNTERS] Fortran MEs ( 1 ) : 0.5445s for 90112 events => throughput is 1.65E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162897371946169] fbridge_mode=1 [UNWEIGHT] Wrote 1620 events (found 1625 events) - [COUNTERS] PROGRAM TOTAL : 0.4525s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4113s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4570s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4150s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0420s for 8192 events => throughput is 1.95E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 291c38991b..bf03415f4c 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/h make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_05:59:20 +DATE: 2024-06-03_02:38:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9570s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9073s - [COUNTERS] Fortran MEs ( 1 ) : 0.0498s for 8192 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9525s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9028s + [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4201s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3715s - [COUNTERS] Fortran MEs ( 1 ) : 0.0486s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4237s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3740s + [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp/avalassi/output_heftggbb_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8553s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3195s - [COUNTERS] Fortran MEs ( 1 ) : 0.5358s for 90112 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8805s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3361s + [COUNTERS] Fortran MEs ( 1 ) : 0.5444s for 90112 events => throughput is 1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -134,9 +134,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955975930954] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4604s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4637s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4196s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -156,7 +156,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -168,9 +168,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895706383660] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8327s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4793s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8548s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3687s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4861s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -184,13 +184,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.817766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.787574e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.799752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.794973e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! 
Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -214,9 +214,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955975930958] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3979s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4230s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3983s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 8192 events => throughput is 3.31E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -236,7 +236,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -248,9 +248,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895706383669] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.6020s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3321s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2699s for 90112 events => throughput is 3.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6212s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3479s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2733s for 90112 events => throughput is 3.30E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -264,13 +264,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.208317e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.207786e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.242147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237136e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -282,7 +282,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! 
Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -294,9 +294,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4018s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3865s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3894s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0152s for 8192 events => throughput is 5.39E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -316,7 +316,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -328,9 +328,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.4998s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3319s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1679s for 90112 events => throughput is 5.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3424s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1708s for 90112 events => throughput is 5.28E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -344,13 +344,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.916130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.675666e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.911254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.789339e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,7 +362,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! 
Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -374,9 +374,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4012s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.56E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -396,7 +396,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -408,9 +408,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.4747s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3226s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1520s for 90112 events => throughput is 5.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4951s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3399s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1552s for 90112 events => throughput is 5.81E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -424,13 +424,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.307270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.160618e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.162265e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.006884e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -442,7 +442,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! 
Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -454,9 +454,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3948s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 8192 events => throughput is 3.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4228s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -476,7 +476,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW @@ -488,9 +488,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.5949s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3393s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2556s for 90112 events => throughput is 3.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6127s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3526s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2600s for 90112 events => throughput is 3.47E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -504,13 +504,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.332654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310020e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.349365e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332375e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -522,7 +522,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! 
Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -533,9 +533,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955503257827] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8124s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8117s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.44E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -555,7 +555,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -566,8 +566,8 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895242795732] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.7514s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7444s + [COUNTERS] PROGRAM TOTAL : 1.7688s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7618s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -581,42 +581,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.815756e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.921899e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.247313e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.241102e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.109600e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122777e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.657132e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.754404e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.111219e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.139288e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.039865e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037609e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.111200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.121155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.771983e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.701139e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 80269e77b1..fa651276a5 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,30 +1,30 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_06:00:18 +DATE: 2024-06-03_02:39:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 2.6807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3392s - [COUNTERS] Fortran MEs ( 1 ) : 2.3415s for 8192 events => throughput is 3.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7116s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3448s + [COUNTERS] Fortran MEs ( 1 ) : 2.3668s for 8192 events => throughput is 3.46E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.6731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3387s - [COUNTERS] Fortran MEs ( 1 ) : 2.3344s for 8192 events => throughput is 3.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7131s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] Fortran MEs ( 1 ) : 2.3711s for 8192 events => throughput is 3.45E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /tmp/avalassi/output_smeftggtttt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > / [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 27.7347s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8315s - [COUNTERS] Fortran MEs ( 1 ) : 25.9031s for 90112 events => throughput is 3.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.8163s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8317s + [COUNTERS] Fortran MEs ( 1 ) : 25.9846s for 90112 events => throughput is 3.47E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 5.3198s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7672s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5526s for 8192 events => throughput is 3.21E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.3378s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7898s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5480s for 8192 events => throughput is 3.22E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438187E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 31.9550s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2173s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.7377s for 90112 events => throughput is 3.25E+03 events/s + [COUNTERS] PROGRAM TOTAL : 32.4420s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2681s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.1739s for 90112 events => throughput is 3.20E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.410579e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.365920e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.412797e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.364548e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084412E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.9225s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6149s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3076s for 8192 events => throughput is 6.27E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.9775s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6436s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3339s for 8192 events => throughput is 6.14E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 17.4719s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0716s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.4004s for 90112 events => throughput is 6.26E+03 events/s + [COUNTERS] PROGRAM TOTAL : 17.7362s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1158s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.6204s for 90112 events => throughput is 6.16E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.486190e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.420385e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.480385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.404299e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.4971s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5810s for 8192 events => throughput is 1.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.5181s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5898s for 8192 events => throughput is 1.39E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 8.7948s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3698s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.4250s for 90112 events => throughput is 1.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.8864s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3960s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4904s for 90112 events => throughput is 1.39E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.446474e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.431830e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.453779e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.434276e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.3443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8339s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5104s for 8192 events => throughput is 1.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8508s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5193s for 8192 events => throughput is 1.58E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 8.0840s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3189s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.7651s for 90112 events => throughput is 1.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.0478s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3265s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7213s for 90112 events => throughput is 1.58E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.657111e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635240e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.653778e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.638985e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.7114s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6926s for 8192 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7295s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0334s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6961s for 8192 events => throughput is 1.18E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 10.1275s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.5992s for 90112 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.1646s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5113s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.6534s for 90112 events => throughput is 1.18E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195609e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220944e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192816e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8165s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.79E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,8 +546,8 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.4753s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2868s + [COUNTERS] PROGRAM TOTAL : 2.4746s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2861s [COUNTERS] CudaCpp MEs ( 2 ) : 0.1885s for 90112 events => throughput is 4.78E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.843300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.844103e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.218089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.234617e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.155842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.148821e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.421655e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.426966e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.190444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.160825e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.415946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.418503e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.148629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.147643e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.764175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.759279e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 45b154f6da..82dd2d8d1d 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_06:02:57 +DATE: 2024-06-03_02:42:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 2.6694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3400s - [COUNTERS] Fortran MEs ( 1 ) : 2.3294s for 8192 events => throughput is 3.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7113s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3443s + [COUNTERS] Fortran MEs ( 1 ) : 2.3670s for 8192 events => throughput is 3.46E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.6748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3390s - [COUNTERS] Fortran MEs ( 1 ) : 2.3358s for 8192 events => throughput is 3.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3424s + [COUNTERS] Fortran MEs ( 1 ) : 2.3712s for 8192 events => throughput is 3.45E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /tmp/avalassi/output_smeftggtttt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > / [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 27.4771s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8059s - [COUNTERS] Fortran MEs ( 1 ) : 25.6712s for 90112 events => throughput is 3.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.8698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8354s + [COUNTERS] Fortran MEs ( 1 ) : 26.0343s for 90112 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896785213255034E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 5.0871s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6795s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4076s for 8192 events => throughput is 3.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.1513s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7119s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4394s for 8192 events => throughput is 3.36E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668138359550833E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 30.6384s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1272s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.5112s for 90112 events => throughput is 3.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 31.0280s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1848s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.8433s for 90112 events => throughput is 3.36E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.518646e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.473180e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.519140e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467630e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896766542858863E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.6928s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6797s for 8192 events => throughput is 1.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7129s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6907s for 8192 events => throughput is 1.19E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668121906848987E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 9.9205s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4645s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.4560s for 90112 events => throughput is 1.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.0727s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4928s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.5799s for 90112 events => throughput is 1.19E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232964e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217938e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215916e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.9294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2988s for 8192 events => throughput is 2.74E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9442s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6416s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3027s for 8192 events => throughput is 2.71E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 5.3691s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0900s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.2791s for 90112 events => throughput is 2.75E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1125s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3455s for 90112 events => throughput is 2.69E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.797581e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.798829e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799365e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.800347e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8582s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5958s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2624s for 8192 events => throughput is 3.12E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8729s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6055s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2674s for 8192 events => throughput is 3.06E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 4.9443s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0456s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8987s for 90112 events => throughput is 3.11E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0205s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0739s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.9465s for 90112 events => throughput is 3.06E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.232867e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168404e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.231460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.175648e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896778056937195E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.0285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0352s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3472s for 8192 events => throughput is 2.36E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668139178203571E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 5.9990s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1683s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8307s for 90112 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.9908s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1644s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8264s for 90112 events => throughput is 2.36E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.399591e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.392336e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392653e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396345e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896805369365078E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0140s for 8192 events => throughput is 5.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8232s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8092s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0140s for 8192 events => throughput is 5.87E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668194616292154E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.4309s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1540s for 90112 events => throughput is 5.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4312s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1539s for 90112 events => throughput is 5.85E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229528e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.221227e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.512458e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502146e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.376887e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.377943e+06 ) sec^-1 *** 
EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.385341e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.384704e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.353397e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.364731e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.396782e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409606e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.372639e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.373672e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.860961e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.829768e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 66daeb0e97..d5f006f577 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_06:05:05 +DATE: 2024-06-03_02:44:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 2.7059s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] Fortran MEs ( 1 ) : 2.3593s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3456s + [COUNTERS] Fortran MEs ( 1 ) : 2.3632s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.7035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s - [COUNTERS] Fortran MEs ( 1 ) : 2.3597s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6651s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3397s + [COUNTERS] Fortran MEs ( 1 ) : 2.3253s for 8192 events => throughput is 3.52E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /tmp/avalassi/output_smeftggtttt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > / [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 27.5770s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8193s - [COUNTERS] Fortran MEs ( 1 ) : 25.7577s for 90112 events => throughput is 3.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.5341s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8166s + [COUNTERS] Fortran MEs ( 1 ) : 25.7175s for 90112 events => throughput is 3.50E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696375074447E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 5.3005s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7718s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5286s for 8192 events => throughput is 3.24E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.3722s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8085s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5637s for 8192 events => throughput is 3.20E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -155,7 +155,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668081976882373E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 31.9857s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2191s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.7666s for 90112 events => throughput is 3.25E+03 events/s + [COUNTERS] PROGRAM TOTAL : 32.5074s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2758s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2316s for 90112 events => throughput is 3.19E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386546e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.343554e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.397002e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.344274e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +198,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696285825688E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.8744s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5934s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2810s for 8192 events => throughput is 6.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.9360s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6221s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3139s for 8192 events => throughput is 6.23E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -231,7 +231,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668081890954375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 17.1795s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0414s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.1381s for 90112 events => throughput is 6.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 17.4852s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0946s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.3906s for 90112 events => throughput is 6.26E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.678262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.609788e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.743588e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.608079e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -274,7 +274,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.4779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5764s for 8192 events => throughput is 1.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.5027s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9146s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5881s for 8192 events => throughput is 1.39E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -307,7 +307,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 8.7035s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3540s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.3495s for 90112 events => throughput is 1.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.8508s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3935s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4573s for 90112 events => throughput is 1.40E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.454724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.448003e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.476512e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.440825e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -350,7 +350,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.3378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8314s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5064s for 8192 events => throughput is 1.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5151s for 8192 events => throughput is 1.59E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -383,7 +383,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 7.8877s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3042s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.5835s for 90112 events => throughput is 1.61E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.9986s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3246s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.6740s for 90112 events => throughput is 1.59E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.679497e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.644369e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.670221e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646050e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -426,7 +426,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.7201s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0244s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6957s for 8192 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7590s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0494s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7096s for 8192 events => throughput is 1.15E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -459,7 +459,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 10.1095s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4855s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.6240s for 90112 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.3214s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5233s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.7981s for 90112 events => throughput is 1.16E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.204025e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172848e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.203510e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.173704e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -502,7 +502,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697918297644E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8192s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -535,7 +535,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551547592E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.4662s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1894s for 90112 events => throughput is 4.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4765s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2872s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1893s for 90112 events => throughput is 4.76E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.814879e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.819015e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.185918e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.183583e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.154361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.112239e+05 ) sec^-1 *** 
EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.382253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.382293e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.160102e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.110807e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.387193e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.384672e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.108981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.154224e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.750323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.747274e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 059122dda6..b228907f76 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_06:00:05 +DATE: 2024-06-03_02:39:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -47,20 +47,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 - [UNWEIGHT] Wrote 685 events (found 2208 events) - [COUNTERS] PROGRAM TOTAL : 0.4148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4054s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.76E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 + [UNWEIGHT] Wrote 1767 events (found 4306 events) + [COUNTERS] PROGRAM TOTAL : 0.6671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6577s + [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,20 +72,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 - [UNWEIGHT] Wrote 648 events (found 1275 events) - [COUNTERS] PROGRAM TOTAL : 0.3154s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3059s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 + [UNWEIGHT] Wrote 1636 events (found 1641 events) + [COUNTERS] PROGRAM TOTAL : 0.3925s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3832s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.81E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,20 +97,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /tmp/avalassi/output_susyggt1t1_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4762 [0.47620722822826000] fbridge_mode=0 - [UNWEIGHT] Wrote 1784 events (found 1789 events) - [COUNTERS] PROGRAM TOTAL : 1.3773s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2771s - [COUNTERS] Fortran MEs ( 1 ) : 0.1002s for 90112 events => throughput is 8.99E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 + [UNWEIGHT] Wrote 1828 events (found 1833 events) + [COUNTERS] PROGRAM TOTAL : 1.4414s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3401s + [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,15 +122,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 [XSECTION] ERROR! 
No cross section in log file: /tmp/avalassi/output_susyggt1t1_x1_cudacpp ... @@ -139,7 +139,7 @@ xqcutij # 3> 0.0 0.0 NGOODHEL = 4 NCOMB = 4 MULTI_CHANNEL = TRUE - CHANNEL_ID = 2 + CHANNEL_ID = 3 RESET CUMULATIVE VARIABLE 4096 points passed the cut but all returned zero therefore considering this contribution as zero diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 01167da954..7dc3f4f16e 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_06:00:09 +DATE: 2024-06-03_02:39:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -47,20 +47,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 - [UNWEIGHT] Wrote 685 events (found 2208 events) - [COUNTERS] PROGRAM TOTAL : 0.4095s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4004s - [COUNTERS] Fortran MEs ( 1 ) : 0.0092s for 8192 events => throughput is 8.95E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 + [UNWEIGHT] Wrote 1767 events (found 4306 events) + [COUNTERS] PROGRAM TOTAL : 0.6704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6610s + [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,20 +72,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 - [UNWEIGHT] Wrote 648 events (found 1275 events) - [COUNTERS] PROGRAM TOTAL : 0.3199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3104s - [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.61E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 + [UNWEIGHT] Wrote 1636 events (found 1641 events) + [COUNTERS] PROGRAM TOTAL : 0.3924s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,20 +97,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /tmp/avalassi/output_susyggt1t1_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4762 [0.47620722822826000] fbridge_mode=0 - [UNWEIGHT] Wrote 1784 events (found 1789 events) - [COUNTERS] PROGRAM TOTAL : 1.3992s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2973s - [COUNTERS] Fortran MEs ( 1 ) : 0.1018s for 90112 events => throughput is 8.85E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 + [UNWEIGHT] Wrote 1828 events (found 1833 events) + [COUNTERS] PROGRAM TOTAL : 1.4420s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3407s + [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,15 +122,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 [XSECTION] ERROR! 
No cross section in log file: /tmp/avalassi/output_susyggt1t1_x1_cudacpp ... @@ -139,7 +139,7 @@ xqcutij # 3> 0.0 0.0 NGOODHEL = 4 NCOMB = 4 MULTI_CHANNEL = TRUE - CHANNEL_ID = 2 + CHANNEL_ID = 3 RESET CUMULATIVE VARIABLE 4096 points passed the cut but all returned zero therefore considering this contribution as zero diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 6c876298cd..89c65c015c 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -2,12 +2,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/s make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' @@ -18,9 +18,9 @@ make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_06:00:14 +DATE: 2024-06-03_02:39:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -47,20 +47,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 - [UNWEIGHT] Wrote 685 events (found 2208 events) - [COUNTERS] PROGRAM TOTAL : 0.4113s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4020s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.76E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 + [UNWEIGHT] Wrote 1767 events (found 4306 events) + [COUNTERS] PROGRAM TOTAL : 0.6710s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6616s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,20 +72,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 - [UNWEIGHT] Wrote 648 events (found 1275 events) - [COUNTERS] PROGRAM TOTAL : 0.3138s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3043s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.67E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 + [UNWEIGHT] Wrote 1636 events (found 1641 events) + [COUNTERS] PROGRAM TOTAL : 0.3927s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3833s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,20 +97,20 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /tmp/avalassi/output_susyggt1t1_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4762 [0.47620722822826000] fbridge_mode=0 - [UNWEIGHT] Wrote 1784 events (found 1789 events) - [COUNTERS] PROGRAM TOTAL : 1.3758s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2758s - [COUNTERS] Fortran MEs ( 1 ) : 0.1001s for 90112 events => throughput is 9.01E+05 events/s + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 + [UNWEIGHT] Wrote 1828 events (found 1833 events) + [COUNTERS] PROGRAM TOTAL : 1.4406s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3394s + [COUNTERS] Fortran MEs ( 1 ) : 0.1012s for 90112 events => throughput is 8.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,15 +122,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 [XSECTION] ERROR! 
No cross section in log file: /tmp/avalassi/output_susyggt1t1_x1_cudacpp ... @@ -139,7 +139,7 @@ xqcutij # 3> 0.0 0.0 NGOODHEL = 4 NCOMB = 4 MULTI_CHANNEL = TRUE - CHANNEL_ID = 2 + CHANNEL_ID = 3 RESET CUMULATIVE VARIABLE 4096 points passed the cut but all returned zero therefore considering this contribution as zero diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index fd24a61552..c3b7dfd598 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_05:59:47 +DATE: 2024-06-03_02:39:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.8237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7798s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8317s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7874s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3754s - [COUNTERS] Fortran MEs ( 1 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4203s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3760s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp/avalassi/output_susyggtt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.8130s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3278s - [COUNTERS] Fortran MEs ( 1 ) : 0.4852s for 90112 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8213s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3345s + [COUNTERS] Fortran MEs ( 1 ) : 0.4868s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 171.8 [171.81273026311101] fbridge_mode=1 [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.7007s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6611s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 8192 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7094s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6690s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 293718b73f..72c1ceb733 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -2,8 +2,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/s make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_05:59:53 +DATE: 2024-06-03_02:39:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.8342s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7896s - [COUNTERS] Fortran MEs ( 1 ) : 0.0446s for 8192 events => throughput is 1.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7904s + [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4178s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3733s - [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4224s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3780s + [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp/avalassi/output_susyggtt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.8125s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3299s - [COUNTERS] Fortran MEs ( 1 ) : 0.4826s for 90112 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8224s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3351s + [COUNTERS] Fortran MEs ( 1 ) : 0.4873s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 171.8 [171.81270286137041] fbridge_mode=1 [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.7025s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6657s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 8192 events => throughput is 2.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0374s for 8192 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index f9ac9cdc3d..21fd4d6bec 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,13 +1,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' @@ -18,8 +18,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-05-16_05:59:59 +DATE: 2024-06-03_02:39:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -47,7 +47,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.8401s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7957s - [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8339s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7896s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -72,7 +72,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3820s - [COUNTERS] Fortran MEs ( 1 ) : 0.0448s for 8192 events => throughput is 1.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3787s + [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -97,7 +97,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp/avalassi/output_susyggtt_x10_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.8476s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3594s - [COUNTERS] Fortran MEs ( 1 ) : 0.4881s for 90112 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8213s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3354s + [COUNTERS] Fortran MEs ( 1 ) : 0.4859s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -122,7 +122,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) 1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) 0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' [OPENMPTH] omp_get_max_threads/nproc = 1/4 @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 171.8 [171.81273490068889] fbridge_mode=1 [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.7032s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6624s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7067s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0403s for 8192 events => throughput is 2.03E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** From 2f32ffdfc668b57930034a67cf83b8347cc3965d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 08:31:38 +0200 Subject: [PATCH 19/33] [tmad] in gg_ttgg.mad and susy_gg_t1t1.mad, temporarely go back to code with no volatile, to rerun tmad and expose SIGFPE #855 git checkout upstream/master susy_gg_t1t1.mad gg_ttgg.mad --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 24 +++++++++---------- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 20 +++++++--------- .../Source/DHELAS/aloha_functions.f | 2 +- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index e2c6e055a3..d479b476a0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005956411361694336  +DEBUG: model prefixing takes 0.00582575798034668  
INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.171 s +1 processes with 123 diagrams generated in 0.169 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.456 s -Wrote files for 222 helas calls in 0.744 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.460 s +Wrote files for 222 helas calls in 0.741 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.354 s +ALOHA: aloha creates 5 routines in 0.355 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.334 s +ALOHA: aloha creates 10 routines in 0.338 s VVV1 VVV1 FFV1 @@ -235,10 +235,8 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ -INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -259,10 +257,10 @@ Type "launch" to generate events from this 
process, or see Run "open index.html" to see more information about this process. quit -real 0m4.109s -user 0m3.737s -sys 0m0.281s -Code generation completed in 4 seconds +real 0m3.505s +user 0m3.236s +sys 0m0.224s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..657387a586 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 16c822599f..78d37d6c49 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.132 s +1 processes with 6 diagrams generated in 0.130 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,19 +592,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s Wrote files for 16 helas calls in 0.117 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.198 s +ALOHA: aloha creates 3 routines in 0.196 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.194 s +ALOHA: aloha creates 6 
routines in 0.201 s VVV1 VSS1 VSS1 @@ -623,10 +623,8 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ -INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -647,10 +645,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.624s -user 0m2.849s -sys 0m0.311s -Code generation completed in 4 seconds +real 0m3.081s +user 0m2.681s +sys 0m0.245s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..657387a586 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) + double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) From 4fa179068d4151e17edb477e0757fc7201135e6f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 08:39:31 +0200 Subject: [PATCH 20/33] [tmad] temporarely rerun tmad tests for ggttgg and susyggt1t1 to expose SIGFPE #855 - will revert ./tmad/teeMadX.sh -mix -makeclean +10x -ggttgg -susyggt1t1 --- .../log_ggttgg_mad_d_inl0_hrd0.txt | 96 +++++++++---------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 94 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 96 +++++++++---------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 76 ++++++++------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 70 ++++++++------ .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 72 ++++++++------ 6 files changed, 258 insertions(+), 246 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index c981799588..e2636f548b 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -3,18 +3,18 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 
BACKEND=cpp512z + +make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:14:52 +DATE: 2024-06-03_08:34:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.6980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2611s - [COUNTERS] Fortran MEs ( 1 ) : 4.4369s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s + [COUNTERS] Fortran MEs ( 1 ) : 4.4636s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.6906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s - [COUNTERS] Fortran MEs ( 1 ) : 4.4335s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2574s + [COUNTERS] Fortran MEs ( 1 ) : 4.4490s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.6666s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8936s - [COUNTERS] Fortran MEs ( 1 ) : 48.7731s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.7925s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9034s + [COUNTERS] Fortran MEs ( 1 ) : 48.8892s for 90112 events => throughput is 1.84E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,42 +125,34 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.4632 [0.46320556621222236] fbridge_mode=1 - [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 9.2716s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7071s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5645s for 8192 events => throughput is 1.79E+03 events/s - -*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.46320556621222242) and cpp (0.46320556621222236) differ by less than 3E-14 (1.1102230246251565e-16) - -*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
-diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 -6,8c6,8 -< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. -< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. -< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. ---- -> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. -> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. -> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. -20c20 -< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. ---- -> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. -22c22 -< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. ---- -> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. -24c24 -< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. ---- -> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. + +Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. + +Backtrace for this error: +#0 0x7f10e1423860 in ??? +#1 0x7f10e1422a05 in ??? +#2 0x7f10e1054def in ??? +#3 0x44b5ff in ??? +#4 0x4087df in ??? +#5 0x409848 in ??? +#6 0x40bb83 in ??? +#7 0x40d1a9 in ??? 
+#8 0x45c804 in ??? +#9 0x434269 in ??? +#10 0x40371e in ??? +#11 0x7f10e103feaf in ??? +#12 0x7f10e103ff5f in ??? +#13 0x403844 in ??? +#14 0xffffffffffffffff in ??? +./madX.sh: line 388: 1458243 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} +ERROR! ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' failed + Running at P P machine @ 13000.000000000000 GeV + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis + + + getting user params +Enter number of events and max and min iterations: diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index e5afb01bb6..73e66f25d8 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,18 +2,18 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering 
directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:16:03 +DATE: 2024-06-03_08:35:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.6224s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s - [COUNTERS] Fortran MEs ( 1 ) : 4.3645s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7006s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2599s + [COUNTERS] Fortran MEs ( 1 ) : 4.4406s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.6165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2540s - [COUNTERS] Fortran MEs ( 1 ) : 4.3625s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7005s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2571s + [COUNTERS] Fortran MEs ( 1 ) : 4.4435s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < 
/tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.5574s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8890s - [COUNTERS] Fortran MEs ( 1 ) : 48.6684s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.7843s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8992s + [COUNTERS] Fortran MEs ( 1 ) : 48.8851s for 90112 events => throughput is 1.84E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,42 +125,34 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.4632 [0.46320716609204404] fbridge_mode=1 - [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 8.9166s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5182s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3984s for 8192 events => throughput is 1.86E+03 events/s - -*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.46320556621222242) and cpp (0.46320716609204404) differ by less than 4E-4 (3.453930475627587e-06) - -*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
-diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 -6,8c6,8 -< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. -< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. -< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. ---- -> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. -> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. -> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. -20c20 -< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. ---- -> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. -22c22 -< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. ---- -> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. -24c24 -< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. ---- -> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. + +Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. + +Backtrace for this error: +#0 0x7fe2e1a23860 in ??? +#1 0x7fe2e1a22a05 in ??? +#2 0x7fe2e1654def in ??? +#3 0x44b5ff in ??? +#4 0x4087df in ??? +#5 0x409848 in ??? +#6 0x40bb83 in ??? +#7 0x40d1a9 in ??? 
+#8 0x45c804 in ??? +#9 0x434269 in ??? +#10 0x40371e in ??? +#11 0x7fe2e163feaf in ??? +#12 0x7fe2e163ff5f in ??? +#13 0x403844 in ??? +#14 0xffffffffffffffff in ??? +./madX.sh: line 388: 1458726 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} +ERROR! ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' failed + Running at P P machine @ 13000.000000000000 GeV + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis + + + getting user params +Enter number of events and max and min iterations: diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 05784aaa7b..32ea7f64a4 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,17 +3,17 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:17:14 +DATE: 2024-06-03_08:36:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.6921s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2633s - [COUNTERS] Fortran MEs ( 1 ) : 4.4288s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2613s + [COUNTERS] Fortran MEs ( 1 ) : 4.4454s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.6873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2561s - [COUNTERS] Fortran MEs ( 1 ) : 4.4311s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7028s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s + [COUNTERS] Fortran MEs ( 1 ) : 4.4449s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < 
/tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.6939s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8954s - [COUNTERS] Fortran MEs ( 1 ) : 48.7985s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.7520s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8982s + [COUNTERS] Fortran MEs ( 1 ) : 48.8538s for 90112 events => throughput is 1.84E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,42 +125,34 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.4632 [0.46320556893412546] fbridge_mode=1 - [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 9.3537s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7437s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6100s for 8192 events => throughput is 1.78E+03 events/s - -*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.46320556621222242) and cpp (0.46320556893412546) differ by less than 2E-4 (5.876231279344779e-09) - -*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
-diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 -6,8c6,8 -< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. -< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. -< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. ---- -> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. -> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. -> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. -20c20 -< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. ---- -> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. -22c22 -< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. ---- -> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. -24c24 -< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. ---- -> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. + +Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. + +Backtrace for this error: +#0 0x7f191ce23860 in ??? +#1 0x7f191ce22a05 in ??? +#2 0x7f191ca54def in ??? +#3 0x44b5ff in ??? +#4 0x4087df in ??? +#5 0x409848 in ??? +#6 0x40bb83 in ??? +#7 0x40d1a9 in ??? 
+#8 0x45c804 in ??? +#9 0x434269 in ??? +#10 0x40371e in ??? +#11 0x7f191ca3feaf in ??? +#12 0x7f191ca3ff5f in ??? +#13 0x403844 in ??? +#14 0xffffffffffffffff in ??? +./madX.sh: line 388: 1459209 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} +ERROR! ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' failed + Running at P P machine @ 13000.000000000000 GeV + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis + + + getting user params +Enter number of events and max and min iterations: diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index b228907f76..22c2975bdd 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,13 +1,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone + + make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:35 +DATE: 2024-06-03_08:37:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6577s - [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6658s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6564s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.71E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3832s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3931s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3835s + [COUNTERS] Fortran MEs ( 1 ) : 0.0096s for 8192 events => throughput is 8.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4414s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3401s - [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4439s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3427s + [COUNTERS] Fortran MEs ( 1 ) : 0.1012s for 90112 
events => throughput is 8.91E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,34 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] ERROR! No cross section in log file: - /tmp/avalassi/output_susyggt1t1_x1_cudacpp - ... -xqcutij # 3> 0.0 0.0 - RESET CUMULATIVE VARIABLE - NGOODHEL = 4 - NCOMB = 4 - MULTI_CHANNEL = TRUE - CHANNEL_ID = 3 - RESET CUMULATIVE VARIABLE - 4096 points passed the cut but all returned zero - therefore considering this contribution as zero - Deleting file events.lhe + +Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. + +Backtrace for this error: +#0 0x7fee2dc23860 in ??? +#1 0x7fee2dc22a05 in ??? +#2 0x7fee2d854def in ??? +#3 0x43809f in ??? +#4 0x40581f in ??? +#5 0x4067b1 in ??? +#6 0x408c71 in ??? +#7 0x40a0a9 in ??? +#8 0x444fdf in ??? +#9 0x42bb38 in ??? +#10 0x40371e in ??? +#11 0x7fee2d83feaf in ??? +#12 0x7fee2d83ff5f in ??? +#13 0x403844 in ??? +#14 0xffffffffffffffff in ??? +./madX.sh: line 388: 1459682 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} +ERROR! 
' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' failed + mdl_conjg__NN3x2 = (8.77004900000000059E-002,-0.0000000000000000) + mdl_conjg__NN3x3 = (0.69587750000000004,-0.0000000000000000) + mdl_conjg__NN3x4 = (0.71022700000000005,-0.0000000000000000) + mdl_conjg__NN4x1 = (-0.11650710000000000,-0.0000000000000000) + mdl_conjg__NN4x2 = (0.31073899999999999,-0.0000000000000000) + mdl_conjg__NN4x3 = (0.64922599999999997,-0.0000000000000000) + mdl_conjg__NN4x4 = (-0.68437780000000004,-0.0000000000000000) + mdl_conjg__UU1x1 = (0.91683490000000001,-0.0000000000000000) + mdl_conjg__UU1x2 = (-0.39926660000000003,-0.0000000000000000) + mdl_conjg__UU2x1 = (0.39926660000000003,-0.0000000000000000) diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 7dc3f4f16e..f86a3fb9ca 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:40 +DATE: 2024-06-03_08:37:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6610s - [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6651s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.6557s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.78E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3839s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.75E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4420s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3407s - [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4413s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3404s + [COUNTERS] Fortran MEs ( 1 ) : 0.1009s for 90112 events => throughput is 8.93E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,34 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] ERROR! No cross section in log file: - /tmp/avalassi/output_susyggt1t1_x1_cudacpp - ... -xqcutij # 3> 0.0 0.0 - RESET CUMULATIVE VARIABLE - NGOODHEL = 4 - NCOMB = 4 - MULTI_CHANNEL = TRUE - CHANNEL_ID = 3 - RESET CUMULATIVE VARIABLE - 4096 points passed the cut but all returned zero - therefore considering this contribution as zero - Deleting file events.lhe + +Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. + +Backtrace for this error: +#0 0x7ffb61823860 in ??? +#1 0x7ffb61822a05 in ??? +#2 0x7ffb61454def in ??? +#3 0x43809f in ??? +#4 0x40581f in ??? +#5 0x4067b1 in ??? +#6 0x408c71 in ??? +#7 0x40a0a9 in ??? +#8 0x444fdf in ??? +#9 0x42bb38 in ??? +#10 0x40371e in ??? +#11 0x7ffb6143feaf in ??? +#12 0x7ffb6143ff5f in ??? +#13 0x403844 in ??? +#14 0xffffffffffffffff in ??? +./madX.sh: line 388: 1460160 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} +ERROR! 
' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' failed + mdl_conjg__NN3x2 = (8.77004900000000059E-002,-0.0000000000000000) + mdl_conjg__NN3x3 = (0.69587750000000004,-0.0000000000000000) + mdl_conjg__NN3x4 = (0.71022700000000005,-0.0000000000000000) + mdl_conjg__NN4x1 = (-0.11650710000000000,-0.0000000000000000) + mdl_conjg__NN4x2 = (0.31073899999999999,-0.0000000000000000) + mdl_conjg__NN4x3 = (0.64922599999999997,-0.0000000000000000) + mdl_conjg__NN4x4 = (-0.68437780000000004,-0.0000000000000000) + mdl_conjg__UU1x1 = (0.91683490000000001,-0.0000000000000000) + mdl_conjg__UU1x2 = (-0.39926660000000003,-0.0000000000000000) + mdl_conjg__UU2x1 = (0.39926660000000003,-0.0000000000000000) diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 89c65c015c..fdd705f7f1 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -13,13 +13,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:45 +DATE: 2024-06-03_08:37:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6710s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6616s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6646s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6551s + [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.59E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3927s - [COUNTERS] Fortran Overhead ( 0 ) : 
0.3833s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.79E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,8 +108,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4406s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3394s + [COUNTERS] PROGRAM TOTAL : 1.4422s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3410s [COUNTERS] Fortran MEs ( 1 ) : 0.1012s for 90112 events => throughput is 8.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** @@ -125,22 +125,34 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] ERROR! No cross section in log file: - /tmp/avalassi/output_susyggt1t1_x1_cudacpp - ... -xqcutij # 3> 0.0 0.0 - RESET CUMULATIVE VARIABLE - NGOODHEL = 4 - NCOMB = 4 - MULTI_CHANNEL = TRUE - CHANNEL_ID = 3 - RESET CUMULATIVE VARIABLE - 4096 points passed the cut but all returned zero - therefore considering this contribution as zero - Deleting file events.lhe + +Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. + +Backtrace for this error: +#0 0x7f789d023860 in ??? 
+#1 0x7f789d022a05 in ??? +#2 0x7f789cc54def in ??? +#3 0x43809f in ??? +#4 0x40581f in ??? +#5 0x4067b1 in ??? +#6 0x408c71 in ??? +#7 0x40a0a9 in ??? +#8 0x444fdf in ??? +#9 0x42bb38 in ??? +#10 0x40371e in ??? +#11 0x7f789cc3feaf in ??? +#12 0x7f789cc3ff5f in ??? +#13 0x403844 in ??? +#14 0xffffffffffffffff in ??? +./madX.sh: line 388: 1460638 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} +ERROR! ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' failed + mdl_conjg__NN3x2 = (8.77004900000000059E-002,-0.0000000000000000) + mdl_conjg__NN3x3 = (0.69587750000000004,-0.0000000000000000) + mdl_conjg__NN3x4 = (0.71022700000000005,-0.0000000000000000) + mdl_conjg__NN4x1 = (-0.11650710000000000,-0.0000000000000000) + mdl_conjg__NN4x2 = (0.31073899999999999,-0.0000000000000000) + mdl_conjg__NN4x3 = (0.64922599999999997,-0.0000000000000000) + mdl_conjg__NN4x4 = (-0.68437780000000004,-0.0000000000000000) + mdl_conjg__UU1x1 = (0.91683490000000001,-0.0000000000000000) + mdl_conjg__UU1x2 = (-0.39926660000000003,-0.0000000000000000) + mdl_conjg__UU2x1 = (0.39926660000000003,-0.0000000000000000) From 67cba70483231acbe314c5e07910c13dc6cadec0 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 08:40:11 +0200 Subject: [PATCH 21/33] [tmad] ** COMPLETE TMAD ** revert the two last temporary commits which confirmed that SIGFPE #855 was present and is now fixed Revert "[tmad] temporarely rerun tmad tests for ggttgg and susyggt1t1 to expose SIGFPE #855 - will revert" This reverts commit 4fa179068d4151e17edb477e0757fc7201135e6f. Revert "[tmad] in gg_ttgg.mad and susy_gg_t1t1.mad, temporarely go back to code with no volatile, to rerun tmad and expose SIGFPE #855" This reverts commit 2f32ffdfc668b57930034a67cf83b8347cc3965d. 
--- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 24 ++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 20 ++-- .../Source/DHELAS/aloha_functions.f | 2 +- .../log_ggttgg_mad_d_inl0_hrd0.txt | 96 ++++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 94 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 96 ++++++++++--------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 76 +++++++-------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 70 ++++++-------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 72 ++++++-------- 10 files changed, 272 insertions(+), 280 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index d479b476a0..e2c6e055a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00582575798034668  +DEBUG: model prefixing takes 0.005956411361694336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.169 s +1 processes with 123 diagrams generated in 0.171 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.460 s -Wrote files for 222 helas calls in 0.741 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.456 s +Wrote files for 222 helas calls in 0.744 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.355 s +ALOHA: aloha creates 5 routines in 0.354 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.338 s +ALOHA: aloha creates 10 routines in 0.334 s VVV1 VVV1 FFV1 @@ -235,8 +235,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -257,10 +259,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.505s -user 0m3.236s -sys 0m0.224s -Code generation completed in 3 seconds +real 0m4.109s +user 0m3.737s +sys 0m0.281s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 78d37d6c49..16c822599f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.130 s +1 processes with 6 diagrams generated in 0.132 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,19 +592,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s Wrote files for 16 helas calls in 0.117 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.196 s +ALOHA: aloha creates 3 routines in 0.198 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.201 s +ALOHA: aloha creates 6 routines in 0.194 s VVV1 VSS1 VSS1 @@ -623,8 +623,10 @@ If you want to make this value the default for future session, you can run 'save save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ +INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -645,10 +647,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m3.081s -user 0m2.681s -sys 0m0.245s -Code generation completed in 3 seconds +real 0m3.624s +user 0m2.849s +sys 0m0.311s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f index 657387a586..d0ec1dbde9 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index e2636f548b..c981799588 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -3,18 +3,18 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 
BACKEND=cpp512z - -make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_08:34:03 +DATE: 2024-06-02_22:14:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.7296s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s - [COUNTERS] Fortran MEs ( 1 ) : 4.4636s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6980s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2611s + [COUNTERS] Fortran MEs ( 1 ) : 4.4369s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.7064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2574s - [COUNTERS] Fortran MEs ( 1 ) : 4.4490s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6906s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s + [COUNTERS] Fortran MEs ( 1 ) : 4.4335s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.7925s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9034s - [COUNTERS] Fortran MEs ( 1 ) : 48.8892s for 90112 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.6666s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8936s + [COUNTERS] Fortran MEs ( 1 ) : 48.7731s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,34 +125,42 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - -Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. - -Backtrace for this error: -#0 0x7f10e1423860 in ??? -#1 0x7f10e1422a05 in ??? -#2 0x7f10e1054def in ??? -#3 0x44b5ff in ??? -#4 0x4087df in ??? -#5 0x409848 in ??? -#6 0x40bb83 in ??? -#7 0x40d1a9 in ??? -#8 0x45c804 in ??? -#9 0x434269 in ??? -#10 0x40371e in ??? -#11 0x7f10e103feaf in ??? -#12 0x7f10e103ff5f in ??? -#13 0x403844 in ??? -#14 0xffffffffffffffff in ??? -./madX.sh: line 388: 1458243 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' failed - Running at P P machine @ 13000.000000000000 GeV - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. 
- Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis - - - getting user params -Enter number of events and max and min iterations: + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556621222236] fbridge_mode=1 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 9.2716s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7071s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5645s for 8192 events => throughput is 1.79E+03 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320556621222236) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 +6,8c6,8 +< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +--- +> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. 
+> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +20c20 +< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +--- +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +22c22 +< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +--- +> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +24c24 +< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. +--- +> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 73e66f25d8..e5afb01bb6 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,18 +2,18 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 - make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 BACKEND=cpp512z make[1]: 
Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_08:35:05 +DATE: 2024-06-02_22:16:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.7006s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2599s - [COUNTERS] Fortran MEs ( 1 ) : 4.4406s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6224s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s + [COUNTERS] Fortran MEs ( 1 ) : 4.3645s for 8192 events => throughput is 1.88E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.7005s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2571s - [COUNTERS] Fortran MEs ( 1 ) : 4.4435s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2540s + [COUNTERS] Fortran MEs ( 1 ) : 4.3625s for 8192 events => throughput is 1.88E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' 
./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.7843s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8992s - [COUNTERS] Fortran MEs ( 1 ) : 48.8851s for 90112 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.5574s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8890s + [COUNTERS] Fortran MEs ( 1 ) : 48.6684s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,34 +125,42 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - -Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. - -Backtrace for this error: -#0 0x7fe2e1a23860 in ??? -#1 0x7fe2e1a22a05 in ??? -#2 0x7fe2e1654def in ??? -#3 0x44b5ff in ??? -#4 0x4087df in ??? -#5 0x409848 in ??? -#6 0x40bb83 in ??? -#7 0x40d1a9 in ??? -#8 0x45c804 in ??? -#9 0x434269 in ??? -#10 0x40371e in ??? -#11 0x7fe2e163feaf in ??? -#12 0x7fe2e163ff5f in ??? -#13 0x403844 in ??? -#14 0xffffffffffffffff in ??? -./madX.sh: line 388: 1458726 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' failed - Running at P P machine @ 13000.000000000000 GeV - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. 
- Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis - - - getting user params -Enter number of events and max and min iterations: + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320716609204404] fbridge_mode=1 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 8.9166s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5182s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3984s for 8192 events => throughput is 1.86E+03 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320716609204404) differ by less than 4E-4 (3.453930475627587e-06) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 +6,8c6,8 +< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +--- +> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. 
+> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +20c20 +< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +--- +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +22c22 +< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +--- +> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +24c24 +< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. +--- +> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 32ea7f64a4..05784aaa7b 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,17 +3,17 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_08:36:08 +DATE: 2024-06-02_22:17:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.7068s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2613s - [COUNTERS] Fortran MEs ( 1 ) : 4.4454s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6921s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2633s + [COUNTERS] Fortran MEs ( 1 ) : 4.4288s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.7028s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s - [COUNTERS] Fortran MEs ( 1 ) : 4.4449s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6873s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.2561s + [COUNTERS] Fortran MEs ( 1 ) : 4.4311s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.7520s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8982s - [COUNTERS] Fortran MEs ( 1 ) : 48.8538s for 90112 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.6939s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8954s + [COUNTERS] Fortran MEs ( 1 ) : 48.7985s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,34 +125,42 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - -Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. - -Backtrace for this error: -#0 0x7f191ce23860 in ??? -#1 0x7f191ce22a05 in ??? -#2 0x7f191ca54def in ??? -#3 0x44b5ff in ??? -#4 0x4087df in ??? -#5 0x409848 in ??? -#6 0x40bb83 in ??? -#7 0x40d1a9 in ??? -#8 0x45c804 in ??? -#9 0x434269 in ??? -#10 0x40371e in ??? -#11 0x7f191ca3feaf in ??? -#12 0x7f191ca3ff5f in ??? -#13 0x403844 in ??? -#14 0xffffffffffffffff in ??? -./madX.sh: line 388: 1459209 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! 
' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' failed - Running at P P machine @ 13000.000000000000 GeV - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis - - - getting user params -Enter number of events and max and min iterations: + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.4632 [0.46320556893412546] fbridge_mode=1 + [UNWEIGHT] Wrote 11 events (found 168 events) + [COUNTERS] PROGRAM TOTAL : 9.3537s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7437s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6100s for 8192 events => throughput is 1.78E+03 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320556893412546) differ by less than 2E-4 (5.876231279344779e-09) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/events.lhe.ref.1 | head -20 +6,8c6,8 +< -6 1 1 2 0 503 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +< 21 1 1 2 504 501 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +< 21 1 1 2 505 504 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. 
+--- +> -6 1 1 2 0 504 0.18965250326E+03 -0.37597274505E+02 0.12649008736E+03 0.28863535688E+03 0.17300000000E+03 0. 1. +> 21 1 1 2 504 503 0.62170885397E+02 0.36618395894E+02 0.31153079182E+02 0.78591604204E+02 0.00000000000E+00 0. 1. +> 21 1 1 2 505 501 0.17333851786E+01 0.11630357128E+03 0.45398068655E+02 0.12486196360E+03 0.00000000000E+00 0. 1. +20c20 +< 21 -1 0 0 501 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +--- +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.12305922681E+04 0.12305922681E+04 0.00000000000E+00 0. 1. +22c22 +< -6 1 1 2 0 502 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +--- +> -6 1 1 2 0 504 -0.16776755257E+03 -0.12342442113E+03 -0.43168412413E+03 0.50956817253E+03 0.17300000000E+03 0. 1. +24c24 +< 21 1 1 2 505 504 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. +--- +> 21 1 1 2 505 501 0.14318120879E+02 0.15600982705E+02 -0.82469087380E+02 0.85144287067E+02 0.00000000000E+00 0. -1. 
diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 22c2975bdd..b228907f76 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,13 +1,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_08:37:10 +DATE: 2024-06-03_02:39:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6658s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6564s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6577s + [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3931s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3835s - [COUNTERS] Fortran MEs ( 1 ) : 0.0096s for 8192 events => throughput is 8.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3925s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3832s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.81E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4439s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3427s - [COUNTERS] Fortran MEs ( 1 ) : 0.1012s for 90112 events => throughput is 8.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4414s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3401s + [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,34 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' - -Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. - -Backtrace for this error: -#0 0x7fee2dc23860 in ??? -#1 0x7fee2dc22a05 in ??? -#2 0x7fee2d854def in ??? -#3 0x43809f in ??? -#4 0x40581f in ??? 
-#5 0x4067b1 in ??? -#6 0x408c71 in ??? -#7 0x40a0a9 in ??? -#8 0x444fdf in ??? -#9 0x42bb38 in ??? -#10 0x40371e in ??? -#11 0x7fee2d83feaf in ??? -#12 0x7fee2d83ff5f in ??? -#13 0x403844 in ??? -#14 0xffffffffffffffff in ??? -./madX.sh: line 388: 1459682 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' failed - mdl_conjg__NN3x2 = (8.77004900000000059E-002,-0.0000000000000000) - mdl_conjg__NN3x3 = (0.69587750000000004,-0.0000000000000000) - mdl_conjg__NN3x4 = (0.71022700000000005,-0.0000000000000000) - mdl_conjg__NN4x1 = (-0.11650710000000000,-0.0000000000000000) - mdl_conjg__NN4x2 = (0.31073899999999999,-0.0000000000000000) - mdl_conjg__NN4x3 = (0.64922599999999997,-0.0000000000000000) - mdl_conjg__NN4x4 = (-0.68437780000000004,-0.0000000000000000) - mdl_conjg__UU1x1 = (0.91683490000000001,-0.0000000000000000) - mdl_conjg__UU1x2 = (-0.39926660000000003,-0.0000000000000000) - mdl_conjg__UU2x1 = (0.39926660000000003,-0.0000000000000000) + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] ERROR! No cross section in log file: + /tmp/avalassi/output_susyggt1t1_x1_cudacpp + ... 
+xqcutij # 3> 0.0 0.0 + RESET CUMULATIVE VARIABLE + NGOODHEL = 4 + NCOMB = 4 + MULTI_CHANNEL = TRUE + CHANNEL_ID = 3 + RESET CUMULATIVE VARIABLE + 4096 points passed the cut but all returned zero + therefore considering this contribution as zero + Deleting file events.lhe diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index f86a3fb9ca..7dc3f4f16e 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_08:37:15 +DATE: 2024-06-03_02:39:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6557s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6610s + [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3932s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.3839s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3924s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4413s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3404s - [COUNTERS] Fortran MEs ( 1 ) : 0.1009s for 90112 events => throughput is 8.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4420s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3407s + [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,34 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' - -Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. - -Backtrace for this error: -#0 0x7ffb61823860 in ??? -#1 0x7ffb61822a05 in ??? -#2 0x7ffb61454def in ??? -#3 0x43809f in ??? -#4 0x40581f in ??? -#5 0x4067b1 in ??? -#6 0x408c71 in ??? -#7 0x40a0a9 in ??? -#8 0x444fdf in ??? -#9 0x42bb38 in ??? -#10 0x40371e in ??? -#11 0x7ffb6143feaf in ??? -#12 0x7ffb6143ff5f in ??? -#13 0x403844 in ??? -#14 0xffffffffffffffff in ??? -./madX.sh: line 388: 1460160 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! 
' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' failed - mdl_conjg__NN3x2 = (8.77004900000000059E-002,-0.0000000000000000) - mdl_conjg__NN3x3 = (0.69587750000000004,-0.0000000000000000) - mdl_conjg__NN3x4 = (0.71022700000000005,-0.0000000000000000) - mdl_conjg__NN4x1 = (-0.11650710000000000,-0.0000000000000000) - mdl_conjg__NN4x2 = (0.31073899999999999,-0.0000000000000000) - mdl_conjg__NN4x3 = (0.64922599999999997,-0.0000000000000000) - mdl_conjg__NN4x4 = (-0.68437780000000004,-0.0000000000000000) - mdl_conjg__UU1x1 = (0.91683490000000001,-0.0000000000000000) - mdl_conjg__UU1x2 = (-0.39926660000000003,-0.0000000000000000) - mdl_conjg__UU2x1 = (0.39926660000000003,-0.0000000000000000) + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] ERROR! No cross section in log file: + /tmp/avalassi/output_susyggt1t1_x1_cudacpp + ... 
+xqcutij # 3> 0.0 0.0 + RESET CUMULATIVE VARIABLE + NGOODHEL = 4 + NCOMB = 4 + MULTI_CHANNEL = TRUE + CHANNEL_ID = 3 + RESET CUMULATIVE VARIABLE + 4096 points passed the cut but all returned zero + therefore considering this contribution as zero + Deleting file events.lhe diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index fdd705f7f1..89c65c015c 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -13,14 +13,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_08:37:20 +DATE: 2024-06-03_02:39:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6551s - [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6710s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6616s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3943s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.79E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3927s + [COUNTERS] Fortran Overhead ( 0 ) : 
0.3833s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,8 +108,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4422s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3410s + [COUNTERS] PROGRAM TOTAL : 1.4406s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3394s [COUNTERS] Fortran MEs ( 1 ) : 0.1012s for 90112 events => throughput is 8.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** @@ -125,34 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' - -Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation. - -Backtrace for this error: -#0 0x7f789d023860 in ??? -#1 0x7f789d022a05 in ??? -#2 0x7f789cc54def in ??? -#3 0x43809f in ??? -#4 0x40581f in ??? -#5 0x4067b1 in ??? -#6 0x408c71 in ??? -#7 0x40a0a9 in ??? -#8 0x444fdf in ??? -#9 0x42bb38 in ??? -#10 0x40371e in ??? -#11 0x7f789cc3feaf in ??? -#12 0x7f789cc3ff5f in ??? -#13 0x403844 in ??? -#14 0xffffffffffffffff in ??? -./madX.sh: line 388: 1460638 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! 
' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' failed - mdl_conjg__NN3x2 = (8.77004900000000059E-002,-0.0000000000000000) - mdl_conjg__NN3x3 = (0.69587750000000004,-0.0000000000000000) - mdl_conjg__NN3x4 = (0.71022700000000005,-0.0000000000000000) - mdl_conjg__NN4x1 = (-0.11650710000000000,-0.0000000000000000) - mdl_conjg__NN4x2 = (0.31073899999999999,-0.0000000000000000) - mdl_conjg__NN4x3 = (0.64922599999999997,-0.0000000000000000) - mdl_conjg__NN4x4 = (-0.68437780000000004,-0.0000000000000000) - mdl_conjg__UU1x1 = (0.91683490000000001,-0.0000000000000000) - mdl_conjg__UU1x2 = (-0.39926660000000003,-0.0000000000000000) - mdl_conjg__UU2x1 = (0.39926660000000003,-0.0000000000000000) + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] ERROR! No cross section in log file: + /tmp/avalassi/output_susyggt1t1_x1_cudacpp + ... 
+xqcutij # 3> 0.0 0.0 + RESET CUMULATIVE VARIABLE + NGOODHEL = 4 + NCOMB = 4 + MULTI_CHANNEL = TRUE + CHANNEL_ID = 3 + RESET CUMULATIVE VARIABLE + 4096 points passed the cut but all returned zero + therefore considering this contribution as zero + Deleting file events.lhe From f9fceaeec68b58778978743ff5f521dae923e3e1 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 17:28:42 +0200 Subject: [PATCH 22/33] [tmad] regenerate all processes, check that all is ok --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 20 ++-- .../CODEGEN_cudacpp_ee_mumu_log.txt | 8 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++-- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +-- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 24 ++--- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 16 ++-- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 8 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 22 ++--- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 12 +-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 ++-- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 12 +-- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 22 ++--- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 12 +-- .../CODEGEN_mad_heft_gg_bb_log.txt | 16 ++-- .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 10 +- .../CODEGEN_mad_pp_tt012j_log.txt | 92 +++++++++---------- .../CODEGEN_mad_smeft_gg_tttt_log.txt | 24 ++--- .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 16 ++-- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 16 ++-- .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 10 +- .../CODEGEN_mad_susy_gg_tt_log.txt | 16 ++-- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 8 +- 22 files changed, 206 insertions(+), 206 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 98652fbd7d..4ec188cadf 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, 
so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005783796310424805  +DEBUG: model prefixing takes 0.005849361419677734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,8 +176,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.104 s +Wrote files for 8 helas calls in 0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.220 s +ALOHA: aloha creates 3 routines in 0.211 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.270 s +ALOHA: aloha creates 7 routines in 0.269 s FFV1 FFV1 FFV2 @@ -252,10 +252,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.429s -user 0m1.885s -sys 0m0.270s -Code generation completed in 3 seconds +real 0m2.156s +user 0m1.872s +sys 0m0.271s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 0901cfe618..c64f078d3d 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005842447280883789  +DEBUG: model prefixing takes 0.005849123001098633  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.772s -user 0m0.647s -sys 0m0.048s +real 0m0.786s +user 0m0.646s +sys 0m0.059s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 7b00f92838..cfb6edb459 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005838871002197266  +DEBUG: model prefixing takes 0.005819082260131836  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -198,12 +198,12 @@ Wrote files for 10 helas calls in 0.108 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.157 s +ALOHA: aloha creates 2 routines in 0.155 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.142 s +ALOHA: aloha creates 4 routines in 0.140 s VVV1 FFV1 FFV1 @@ -241,9 +241,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.994s -user 0m1.722s -sys 0m0.273s +real 0m1.985s +user 0m1.713s +sys 0m0.272s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 060ebb1d1d..f936b93678 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005850076675415039  +DEBUG: model prefixing takes 0.0058116912841796875  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -183,7 +183,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.153 s +ALOHA: aloha creates 2 routines in 0.152 s VVV1 FFV1 FFV1 @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.562s -user 0m0.507s -sys 0m0.051s -Code generation completed in 1 seconds +real 0m0.570s +user 0m0.504s +sys 0m0.055s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 63e5119ac4..356f172947 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00590205192565918  +DEBUG: model prefixing takes 0.005854129791259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -187,8 +187,8 @@ INFO: Processing color information for process: g g > t t~ g @2 INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -204,8 +204,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -221,14 +221,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s -Wrote files for 46 helas calls in 0.257 s +Wrote files for 46 helas calls in 0.262 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.347 s +ALOHA: aloha creates 5 routines in 0.352 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.328 s +ALOHA: aloha creates 10 routines in 0.336 s VVV1 VVV1 FFV1 @@ -287,10 +287,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.751s -user 0m2.448s -sys 0m0.300s -Code generation completed in 2 seconds +real 0m2.781s +user 0m2.445s +sys 0m0.324s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 917aa6b5ee..89d5e42a2e 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058138370513916016  +DEBUG: model prefixing takes 0.005921602249145508  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,7 +194,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.159 s +Wrote files for 36 helas calls in 0.158 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.330 s +ALOHA: aloha creates 10 routines in 0.332 s VVV1 VVV1 FFV1 @@ -256,9 +256,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.570s -user 0m2.293s -sys 0m0.277s +real 0m2.587s +user 0m2.277s +sys 0m0.294s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index c27b731327..f7ba14f214 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00583338737487793  +DEBUG: model prefixing takes 0.0058133602142333984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.869s -user 0m0.766s -sys 0m0.055s +real 0m0.827s +user 0m0.769s +sys 0m0.053s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index e2c6e055a3..849d5b6525 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005956411361694336  +DEBUG: model prefixing takes 0.005772829055786133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.171 s +1 processes with 123 diagrams generated in 0.169 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,14 +194,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg Generated helas calls for 1 subprocesses (123 diagrams) in 0.456 s -Wrote files for 222 helas calls in 0.744 s +Wrote files for 222 helas calls in 0.738 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.354 s +ALOHA: aloha creates 5 routines in 0.352 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.334 s +ALOHA: aloha creates 10 routines in 0.342 s VVV1 VVV1 FFV1 @@ -259,10 +259,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m4.109s -user 0m3.737s -sys 0m0.281s -Code generation completed in 4 seconds +real 0m5.259s +user 0m3.713s +sys 0m0.289s +Code generation completed in 5 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index db0459398d..903ee6a21e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005809783935546875  +DEBUG: model prefixing takes 0.005794525146484375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.168 s +1 processes with 123 diagrams generated in 0.170 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.458 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.455 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.341 s +ALOHA: aloha creates 5 routines in 0.337 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.536s -user 0m1.466s +real 0m1.530s +user 0m1.456s sys 0m0.061s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index b4219bcd39..476a42fed0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005804300308227539  +DEBUG: model prefixing takes 0.005855083465576172  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.997 s +1 processes with 1240 diagrams generated in 1.990 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -179,8 +179,8 @@ INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 7.030 s -Wrote files for 2281 helas calls in 19.751 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.986 s +Wrote files for 2281 helas calls in 19.681 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s +ALOHA: aloha creates 5 routines in 0.338 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -261,9 +261,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m34.977s -user 0m34.181s -sys 0m0.473s +real 0m34.814s +user 0m34.082s +sys 0m0.442s Code generation completed in 35 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 2b891973d3..ee8521e020 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005815029144287109  +DEBUG: model prefixing takes 0.0058100223541259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.016 s +1 processes with 1240 diagrams generated in 1.992 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.983 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 7.031 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m14.131s -user 0m13.685s -sys 0m0.117s +real 0m13.928s +user 0m13.725s +sys 0m0.126s Code generation completed in 14 seconds diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index fb1c56fdea..44f0debd5b 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005858421325683594  +DEBUG: model prefixing takes 0.005815982818603516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -200,8 +200,8 @@ INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,8 +217,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -234,7 +234,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s -Wrote files for 32 helas calls in 0.235 s +Wrote files for 32 helas calls in 0.232 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines @@ -243,7 +243,7 @@ ALOHA: aloha creates 2 routines in 0.156 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.141 s +ALOHA: aloha creates 4 routines in 0.143 s FFV1 FFV1 FFV1 @@ -298,10 +298,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.457s -user 0m2.036s -sys 0m0.309s -Code generation completed in 3 seconds +real 0m3.796s +user 0m2.043s +sys 0m0.293s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index eeca578667..d9ee132bf2 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005826473236083984  +DEBUG: model prefixing takes 0.005840778350830078  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.084 s +8 processes with 40 diagrams generated in 0.083 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -215,7 +215,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.154 s +ALOHA: aloha creates 2 routines in 0.153 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
quit -real 0m0.693s -user 0m0.629s -sys 0m0.059s +real 0m0.702s +user 0m0.619s +sys 0m0.058s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index a37dd33635..b2a820e487 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -149,8 +149,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Creating files in directory P1_gg_bbx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -166,7 +166,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.110 s +Wrote files for 12 helas calls in 0.111 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -179,7 +179,7 @@ ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.263 s +ALOHA: aloha creates 8 routines in 0.262 s VVS3 VVV1 FFV1 @@ -219,10 +219,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.443s -user 0m1.976s -sys 0m0.266s -Code generation completed in 2 seconds +real 0m2.250s +user 0m1.965s +sys 0m0.278s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 9d498faa46..dce6286d40 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -157,7 +157,7 @@ ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.283 s +ALOHA: aloha creates 4 routines in 0.279 s VVS3 VVV1 FFV1 @@ -174,7 +174,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.887s -user 0m0.624s -sys 0m0.057s -Code generation completed in 0 seconds +real 0m2.095s +user 0m0.614s +sys 0m0.059s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index c999c8e3d4..89b4bd51fa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00587010383605957  +DEBUG: model prefixing takes 0.00579380989074707  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.145 s +13 processes with 76 diagrams generated in 0.144 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.957 s +65 processes with 1119 diagrams generated in 1.940 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -499,8 +499,8 @@ INFO: Combined process c c~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -516,8 +516,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -533,8 +533,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -550,8 +550,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -567,8 +567,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -584,8 +584,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -601,8 +601,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -618,8 +618,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -635,8 +635,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -652,8 +652,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -669,8 +669,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -686,8 +686,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -703,8 +703,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -720,8 +720,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -737,8 +737,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -754,8 +754,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -771,8 +771,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -788,8 +788,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.373 s -Wrote files for 810 helas calls in 3.476 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.383 s +Wrote files for 810 helas calls in 4.486 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.365 s +ALOHA: aloha creates 5 routines in 0.361 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 1.909 s +ALOHA: aloha creates 10 routines in 0.329 s VVV1 VVV1 FFV1 @@ -1032,9 +1032,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m13.190s -user 0m10.662s -sys 0m0.902s +real 0m13.329s +user 0m10.572s +sys 0m0.946s Code generation completed in 13 seconds ************************************************************ * * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 249702623d..577a53a429 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.1493072509765625  +DEBUG: model prefixing takes 0.14798188209533691  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.966 s +1 processes with 72 diagrams generated in 3.952 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -114,8 +114,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 INFO: Creating files in directory P1_gg_ttxttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -130,15 +130,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -Generated helas calls for 1 subprocesses (72 diagrams) in 0.199 s -Wrote files for 119 helas calls in 0.443 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.200 s +Wrote files for 119 helas calls in 0.449 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.338 s +ALOHA: aloha creates 5 routines in 0.337 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -146,7 +146,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.353 s +ALOHA: aloha creates 10 routines in 0.351 s VVV5 VVV5 FFV1 @@ -193,10 +193,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m7.672s -user 0m7.285s -sys 0m0.320s -Code generation completed in 7 seconds +real 0m8.031s +user 0m7.273s +sys 0m0.312s +Code generation completed in 8 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index aba71fa3d5..159f44d59b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.14808392524719238  +DEBUG: model prefixing takes 0.14763712882995605  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.955 s +1 processes with 72 diagrams generated in 3.951 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.200 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.198 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -123,7 +123,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.335 s +ALOHA: aloha creates 5 routines in 0.336 s VVV5 VVV5 FFV1 @@ -143,7 +143,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.428s -user 0m5.332s -sys 0m0.075s -Code generation completed in 5 seconds +real 0m5.433s +user 0m5.333s +sys 0m0.069s +Code generation completed in 6 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 16c822599f..3d8fbc32ca 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.132 s +1 processes with 6 diagrams generated in 0.131 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -576,8 +576,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -604,7 +604,7 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.194 s +ALOHA: aloha creates 6 routines in 0.192 s VVV1 VSS1 VSS1 @@ -647,10 +647,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m3.624s -user 0m2.849s -sys 0m0.311s -Code generation completed in 4 seconds +real 0m3.497s +user 0m2.824s +sys 0m0.326s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 428621a368..829b9ab7c7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.130 s +1 processes with 6 diagrams generated in 0.131 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Load PLUGIN.CUDACPP_OUTPUT @@ -583,7 +583,7 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.194 s +ALOHA: aloha creates 3 routines in 0.203 s VVV1 VSS1 VSS1 @@ -599,7 +599,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
quit -real 0m1.520s -user 0m1.334s -sys 0m0.076s +real 0m1.431s +user 0m1.338s +sys 0m0.069s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index abc4abb141..6a8c405b07 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.126 s +1 processes with 3 diagrams generated in 0.127 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -576,8 +576,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,12 +592,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s Wrote files for 10 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -640,10 +640,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.607s -user 0m2.713s +real 0m3.015s +user 0m2.709s sys 0m0.297s -Code generation completed in 4 seconds +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 9f7d8a4530..86c6a6a716 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.128 s +1 processes with 3 diagrams generated in 0.126 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.906s -user 0m1.287s -sys 0m0.069s +real 0m1.401s +user 0m1.289s +sys 0m0.064s Code generation completed in 2 seconds From f174f4f854f9344ec96b8146a7a6627aeb155eb4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 18:48:02 +0200 Subject: [PATCH 23/33] [tmad] rerun 102 tput tests on itscrd90 - all ok (after merging #860 and #850 for Ccoeff #825) STARTED AT Mon Jun 3 05:51:20 PM CEST 2024 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Mon Jun 3 06:12:34 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Mon Jun 3 06:20:51 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Mon Jun 3 06:29:05 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Mon Jun 3 06:31:55 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Mon Jun 3 06:34:42 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ENDED(6) AT Mon Jun 3 06:37:37 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -mix -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb -makeclean ENDED(7) AT Mon Jun 3 06:47:12 PM CEST 
2024 [Status=0] No errors found in logs --- .../log_eemumu_mad_d_inl0_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_common.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd1.txt | 96 ++++++------ .../log_eemumu_mad_d_inl1_hrd0.txt | 94 +++++------ .../log_eemumu_mad_d_inl1_hrd1.txt | 94 +++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_common.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd1.txt | 94 +++++------ .../log_eemumu_mad_f_inl1_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_f_inl1_hrd1.txt | 94 +++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_m_inl0_hrd1.txt | 94 +++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_common.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd1.txt | 96 ++++++------ .../log_ggtt_mad_d_inl1_hrd0.txt | 104 ++++++------ .../log_ggtt_mad_d_inl1_hrd1.txt | 92 +++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 124 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 122 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 122 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 124 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 122 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 124 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 110 ++++++------- .../log_ggtt_mad_f_inl1_hrd1.txt | 108 ++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 96 
++++++------ .../log_ggtt_mad_m_inl0_hrd1.txt | 96 ++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 118 +++++++------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 118 +++++++------- .../log_ggttg_mad_d_inl0_hrd1.txt | 118 +++++++------- .../log_ggttg_mad_f_inl0_hrd0.txt | 148 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 148 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd1.txt | 148 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_ggttg_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 114 +++++++------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 114 +++++++------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 134 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 136 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 136 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 134 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 136 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 134 ++++++++-------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 142 ++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 142 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 110 ++++++------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 110 ++++++------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 110 ++++++------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 138 ++++++++-------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 140 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 138 ++++++++-------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 
110 ++++++------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_gqttq_mad_d_inl0_hrd0.txt | 130 +++++++-------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 130 +++++++-------- .../log_gqttq_mad_d_inl0_hrd1.txt | 130 +++++++-------- .../log_gqttq_mad_f_inl0_hrd0.txt | 142 ++++++++--------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 144 ++++++++--------- .../log_gqttq_mad_f_inl0_hrd1.txt | 142 ++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_gqttq_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 100 ++++++------ .../log_heftggbb_mad_d_inl0_hrd1.txt | 100 ++++++------ .../log_heftggbb_mad_f_inl0_hrd0.txt | 126 ++++++++------- .../log_heftggbb_mad_f_inl0_hrd1.txt | 122 +++++++-------- .../log_heftggbb_mad_m_inl0_hrd0.txt | 96 ++++++------ .../log_heftggbb_mad_m_inl0_hrd1.txt | 96 ++++++------ .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 110 ++++++------- .../log_smeftggtttt_mad_d_inl0_hrd1.txt | 110 ++++++------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 122 +++++++-------- .../log_smeftggtttt_mad_f_inl0_hrd1.txt | 122 +++++++-------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_smeftggtttt_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggt1t1_mad_d_inl0_hrd1.txt | 144 ++++++++--------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 146 ++++++++--------- .../log_susyggt1t1_mad_f_inl0_hrd1.txt | 146 ++++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd1.txt | 144 ++++++++--------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggtt_mad_d_inl0_hrd1.txt | 144 ++++++++--------- .../log_susyggtt_mad_f_inl0_hrd0.txt | 146 ++++++++--------- .../log_susyggtt_mad_f_inl0_hrd1.txt | 146 ++++++++--------- .../log_susyggtt_mad_m_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggtt_mad_m_inl0_hrd1.txt | 144 ++++++++--------- 102 files 
changed, 5969 insertions(+), 5973 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index de0caca761..56ff30cafc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:48:44 +DATE: 2024-06-03_17:55:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.447081e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.931434e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173062e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.381319e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.697367e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143178e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676046 sec +TOTAL : 0.737241 sec INFO: No Floating Point Exceptions have been reported - 2,567,759,777 cycles # 2.819 GHz - 3,947,530,526 instructions # 1.54 insn per cycle - 0.969595478 seconds time elapsed + 2,630,475,030 cycles # 2.802 GHz + 4,094,804,374 instructions # 1.56 insn per cycle + 1.031371639 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052568e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.236916e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.236916e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.012540e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.179773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.179773e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.401690 sec +TOTAL : 6.634856 sec INFO: No Floating Point Exceptions have been reported - 18,320,184,384 cycles # 2.860 GHz - 43,970,344,438 instructions # 2.40 insn per cycle - 6.407522814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 19,043,816,100 cycles # 2.871 GHz + 46,110,907,096 instructions # 2.42 insn per cycle + 6.640203777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.556597e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.031950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.031950e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536456e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.001382e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.001382e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.466648 sec +TOTAL : 4.519211 sec INFO: No Floating Point Exceptions have been reported - 12,746,464,526 cycles # 2.851 GHz - 30,998,051,748 instructions # 2.43 insn per cycle - 4.472203598 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 12,941,312,867 cycles # 2.861 GHz + 31,615,854,685 instructions # 2.44 insn per cycle + 4.524499594 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.919243e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.664717e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.664717e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.916088e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660561e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660561e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.699016 sec +TOTAL : 3.706215 sec INFO: No Floating Point Exceptions have been reported - 10,057,139,705 cycles # 2.715 GHz - 19,364,699,903 instructions # 1.93 insn per cycle - 3.704443201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,084,827,662 cycles # 2.718 GHz + 19,615,618,896 instructions # 1.95 insn per cycle + 3.711412090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993729e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.809982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.809982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.942239e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.710200e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.710200e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.577215 sec +TOTAL : 3.659343 sec INFO: No Floating Point Exceptions have been reported - 9,735,076,070 cycles # 2.718 GHz - 18,976,322,211 instructions # 1.95 insn per cycle - 3.583082575 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 9,850,079,029 cycles # 2.688 GHz + 19,274,334,982 instructions # 1.96 insn per cycle + 3.664605897 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.660221e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.180702e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.180702e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674322e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.214341 sec +TOTAL : 4.178974 sec INFO: No Floating Point Exceptions have been reported - 8,602,295,276 cycles # 2.039 GHz - 15,727,245,583 instructions # 1.83 insn per cycle - 4.219911758 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,571,642,787 cycles # 2.049 GHz + 15,729,577,049 instructions # 1.84 insn per cycle + 4.184518706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 7ea10d000a..e4612fa859 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:26:15 +DATE: 2024-06-03_18:23:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.576625e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738660e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738660e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.676827e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.079718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.079718e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.288746 sec +TOTAL : 2.257615 sec INFO: No Floating Point Exceptions have been reported - 7,214,350,790 cycles # 2.846 GHz - 12,908,181,952 instructions # 1.79 insn per cycle - 2.590853100 seconds time elapsed + 7,105,304,866 cycles # 2.844 GHz + 12,695,101,201 instructions # 1.79 insn per cycle + 2.557593286 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,16 +90,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.021503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.191283e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.191283e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.768920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.132392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.132392e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.785821 sec +TOTAL : 7.067791 sec INFO: No Floating Point Exceptions have been reported - 19,486,737,909 cycles # 2.870 GHz - 44,194,389,028 instructions # 2.27 insn per cycle - 6.792710044 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 20,274,216,454 cycles # 2.867 GHz + 46,340,905,474 instructions # 2.29 insn per cycle + 7.074366711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -119,16 +119,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.480951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909694e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909694e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.459618e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.878889e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.878889e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.890119 sec +TOTAL : 4.950844 sec INFO: No Floating Point Exceptions have been reported - 14,036,550,987 cycles # 2.867 GHz - 31,841,545,843 instructions # 2.27 insn per cycle - 4.897057684 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 14,186,861,569 cycles # 2.862 GHz + 32,460,104,656 instructions # 2.29 insn per cycle + 4.957781944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -148,16 +148,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.794479e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.441373e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.441373e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.784896e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.422990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.422990e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.161489 sec +TOTAL : 4.177427 sec INFO: No Floating Point Exceptions have been reported - 11,384,673,501 cycles # 2.732 GHz - 20,728,132,603 instructions # 1.82 insn per cycle - 4.168544668 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 11,344,391,753 cycles # 2.712 GHz + 20,973,942,956 instructions # 1.85 insn per cycle + 4.184302710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -177,16 +177,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.875462e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582851e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.582851e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.838122e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.522202e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.522202e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.002181 sec +TOTAL : 4.071273 sec INFO: No Floating Point Exceptions have been reported - 10,994,800,793 cycles # 2.743 GHz - 20,338,605,981 instructions # 1.85 insn per cycle - 4.009209282 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 11,127,064,908 cycles # 2.729 GHz + 20,623,612,207 instructions # 1.85 insn per cycle + 4.078239476 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -206,16 +206,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.543831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.990093e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.990093e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.580968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.045754e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.045754e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.737577 sec +TOTAL : 4.634026 sec INFO: No Floating Point Exceptions have been reported - 9,979,903,374 cycles # 2.116 GHz - 16,882,096,595 instructions # 1.69 insn per cycle - 4.744772163 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 9,906,245,376 cycles # 2.135 GHz + 16,874,719,580 instructions # 1.70 insn per cycle + 4.640929218 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 5164f42c9d..d50297454d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:37:21 +DATE: 2024-06-03_18:34:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.007128e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.848481e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.122082e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.267055e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.830379e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.124220e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.379492 sec +TOTAL : 1.364180 sec INFO: No Floating Point Exceptions have been reported - 4,577,420,690 cycles # 2.842 GHz - 7,053,134,533 instructions # 1.54 insn per cycle - 1.667006809 seconds time elapsed + 4,532,864,593 cycles # 2.840 GHz + 7,022,211,486 instructions # 1.55 insn per cycle + 1.651988516 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053204e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.237226e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.237226e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.009353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.177013e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177013e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.775998 sec +TOTAL : 7.030647 sec INFO: No Floating Point Exceptions have been reported - 19,412,850,180 cycles # 2.863 GHz - 44,070,335,531 instructions # 2.27 insn per cycle - 6.781477582 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 20,161,768,446 cycles # 2.866 GHz + 46,214,693,694 instructions # 2.29 insn per cycle + 7.036306592 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.545449e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.023167e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.023167e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.533517e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003656e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.877650 sec +TOTAL : 4.909085 sec INFO: No Floating Point Exceptions have been reported - 13,891,433,885 cycles # 2.845 GHz - 31,001,668,128 instructions # 2.23 insn per cycle - 4.883211371 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 14,057,102,230 cycles # 2.861 GHz + 31,619,083,498 instructions # 2.25 insn per cycle + 4.914930210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.893576e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.631253e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.631253e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914115e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674802e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.674802e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.124694 sec +TOTAL : 4.087877 sec INFO: No Floating Point Exceptions have been reported - 11,185,517,102 cycles # 2.709 GHz - 19,267,834,957 instructions # 1.72 insn per cycle - 4.130263002 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 11,204,101,414 cycles # 2.738 GHz + 19,516,255,163 instructions # 1.74 insn per cycle + 4.093488605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993280e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.807930e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.807930e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.960651e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.761105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.761105e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.962775 sec +TOTAL : 4.017981 sec INFO: No Floating Point Exceptions have been reported - 10,857,852,651 cycles # 2.737 GHz - 18,688,313,206 instructions # 1.72 insn per cycle - 3.968277953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 11,005,784,213 cycles # 2.736 GHz + 18,974,990,518 instructions # 1.72 insn per cycle + 4.023446242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.669707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.199425e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.199425e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671209e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.198815e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.198815e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.577580 sec +TOTAL : 4.571742 sec INFO: No Floating Point Exceptions have been reported - 9,715,691,578 cycles # 2.121 GHz - 15,431,480,999 instructions # 1.59 insn per cycle - 4.583151536 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 9,691,143,886 cycles # 2.118 GHz + 15,430,112,477 instructions # 1.59 insn per cycle + 4.577138966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index cd3c8cd8c3..5ad03591f2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:34:38 +DATE: 2024-06-03_18:31:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.026327e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.798452e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.142109e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.316519e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.874601e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.146758e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.998322 sec +TOTAL : 0.987264 sec INFO: No Floating Point Exceptions have been reported - 3,480,084,828 cycles # 2.833 GHz - 7,034,366,070 instructions # 2.02 insn per cycle - 1.285315972 seconds time elapsed + 3,447,960,914 cycles # 2.834 GHz + 6,935,540,646 instructions # 2.01 insn per cycle + 1.273699042 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051590e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.235793e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.235793e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.009012e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.176925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176925e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.406221 sec +TOTAL : 6.658869 sec INFO: No Floating Point Exceptions have been reported - 18,351,317,436 cycles # 2.863 GHz - 43,970,738,136 instructions # 2.40 insn per cycle - 6.411801951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 19,081,258,056 cycles # 2.864 GHz + 46,107,329,358 instructions # 2.42 insn per cycle + 6.664506569 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.546913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.022660e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.022660e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.514278e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.976583e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.976583e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.494676 sec +TOTAL : 4.584116 sec INFO: No Floating Point Exceptions have been reported - 12,818,589,929 cycles # 2.849 GHz - 31,001,594,318 instructions # 2.42 insn per cycle - 4.500171814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 13,015,516,432 cycles # 2.837 GHz + 31,615,544,326 instructions # 2.43 insn per cycle + 4.589745993 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.924966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.685276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.685276e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.910581e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.667694e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.667694e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.690851 sec +TOTAL : 3.717580 sec INFO: No Floating Point Exceptions have been reported - 10,087,697,509 cycles # 2.730 GHz - 19,365,345,065 instructions # 1.92 insn per cycle - 3.696395456 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,129,644,926 cycles # 2.722 GHz + 19,615,817,348 instructions # 1.94 insn per cycle + 3.723306833 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.949695e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.742629e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.742629e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.703083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703083e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652825 sec +TOTAL : 3.709507 sec INFO: No Floating Point Exceptions have been reported - 9,810,916,380 cycles # 2.684 GHz - 18,988,601,654 instructions # 1.94 insn per cycle - 3.658397686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 10,001,772,202 cycles # 2.693 GHz + 19,274,333,460 instructions # 1.93 insn per cycle + 3.715129775 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.670977e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.205176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.205176e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.675494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.205413e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.205413e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.189326 sec +TOTAL : 4.178251 sec INFO: No Floating Point Exceptions have been reported - 8,618,980,631 cycles # 2.055 GHz - 15,727,806,217 instructions # 1.82 insn per cycle - 4.194947819 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,596,800,326 cycles # 2.055 GHz + 15,729,129,551 instructions # 1.83 insn per cycle + 4.183796011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 3d612f0f8f..9f94eba974 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:31:52 +DATE: 2024-06-03_18:29:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.966609e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.692288e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.029334e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.059269e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.762127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.035370e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.920797 sec +TOTAL : 1.898879 sec INFO: No Floating Point Exceptions have been reported - 6,106,858,009 cycles # 2.839 GHz - 11,364,477,574 instructions # 1.86 insn per cycle - 2.208478557 seconds time elapsed + 6,080,782,058 cycles # 2.845 GHz + 11,333,399,425 instructions # 1.86 insn per cycle + 2.193723946 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -83,16 +83,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.046138e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.230727e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.230727e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.011160e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180008e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.439007 sec +TOTAL : 6.644599 sec INFO: No Floating Point Exceptions have been reported - 18,389,458,277 cycles # 2.854 GHz - 43,970,512,226 instructions # 2.39 insn per cycle - 6.444728897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 19,072,626,755 cycles # 2.869 GHz + 46,106,399,416 instructions # 2.42 insn per cycle + 6.650013559 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -111,16 +111,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.536098e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017790e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017790e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537609e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.007819e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.007819e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.525302 sec +TOTAL : 4.519754 sec INFO: No Floating Point Exceptions have been reported - 12,893,450,701 cycles # 2.847 GHz - 31,000,830,473 instructions # 2.40 insn per cycle - 4.530866155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 12,966,403,150 cycles # 2.866 GHz + 31,615,629,074 instructions # 2.44 insn per cycle + 4.525378951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -139,16 +139,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.928619e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.689762e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.689762e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.896438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633059e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633059e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.685051 sec +TOTAL : 3.741423 sec INFO: No Floating Point Exceptions have been reported - 10,071,705,419 cycles # 2.730 GHz - 19,365,099,946 instructions # 1.92 insn per cycle - 3.690595449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,139,443,489 cycles # 2.707 GHz + 19,615,728,681 instructions # 1.93 insn per cycle + 3.747032020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.001165e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.830714e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.830714e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.989752e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.800797e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.800797e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.567461 sec +TOTAL : 3.582955 sec INFO: No Floating Point Exceptions have been reported - 9,752,859,688 cycles # 2.730 GHz - 18,976,384,316 instructions # 1.95 insn per cycle - 3.573110488 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 9,847,950,220 cycles # 2.745 GHz + 19,262,900,417 instructions # 1.96 insn per cycle + 3.588451727 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -195,16 +195,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.669903e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.681253e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.222096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.222096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.193613 sec +TOTAL : 4.166168 sec INFO: No Floating Point Exceptions have been reported - 8,620,136,128 cycles # 2.053 GHz - 15,727,513,221 instructions # 1.82 insn per cycle - 4.199142686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,643,873,393 cycles # 2.073 GHz + 15,729,348,046 instructions # 1.82 insn per cycle + 4.171863724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 3617e224c6..36620ecacd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:49:15 +DATE: 2024-06-03_17:55:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.362484e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.520610e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.206124e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.378306e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.711623e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.204981e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676929 sec +TOTAL : 0.690663 sec INFO: No Floating Point Exceptions have been reported - 2,569,510,705 cycles # 2.820 GHz - 4,003,501,539 instructions # 1.56 insn per cycle - 0.973315338 seconds time elapsed + 2,634,366,044 cycles # 2.824 GHz + 4,060,642,595 instructions # 1.54 insn per cycle + 0.990576887 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.103860e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.307114e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.307114e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004096e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168038e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.122439 sec +TOTAL : 6.686767 sec INFO: No Floating Point Exceptions have been reported - 17,532,692,730 cycles # 2.862 GHz - 41,814,035,675 instructions # 2.38 insn per cycle - 6.128383150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) + 19,167,690,225 cycles # 2.865 GHz + 46,069,035,215 instructions # 2.40 insn per cycle + 6.692184734 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.581686e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.085119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.085119e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537206e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.002103e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002103e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.404931 sec +TOTAL : 4.518313 sec INFO: No Floating Point Exceptions have been reported - 12,515,101,521 cycles # 2.838 GHz - 30,161,142,578 instructions # 2.41 insn per cycle - 4.410559397 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) + 12,954,375,073 cycles # 2.864 GHz + 31,589,759,365 instructions # 2.44 insn per cycle + 4.523729432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.946029e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.718574e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.718574e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927185e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.677512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.677512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.653893 sec +TOTAL : 3.686146 sec INFO: No Floating Point Exceptions have been reported - 9,961,431,996 cycles # 2.723 GHz - 19,096,639,277 instructions # 1.92 insn per cycle - 3.659578231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) + 10,059,405,191 cycles # 2.726 GHz + 19,593,879,777 instructions # 1.95 insn per cycle + 3.691447203 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1955) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.016173e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.848698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.848698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.952035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.725882e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.725882e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.542404 sec +TOTAL : 3.645301 sec INFO: No Floating Point Exceptions have been reported - 9,660,599,362 cycles # 2.725 GHz - 18,744,004,297 instructions # 1.94 insn per cycle - 3.547571851 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) + 9,902,836,539 cycles # 2.714 GHz + 19,290,819,520 instructions # 1.95 insn per cycle + 3.650635929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.727713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.296045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296045e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.701058e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.251875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251875e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.062176 sec +TOTAL : 4.122795 sec INFO: No Floating Point Exceptions have been reported - 8,450,337,585 cycles # 2.078 GHz - 15,603,422,783 instructions # 1.85 insn per cycle - 4.067782201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) + 8,456,560,073 cycles # 2.049 GHz + 15,601,817,159 instructions # 1.84 insn per cycle + 4.128003753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1237) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 501b51f71f..451f1ae6b2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:16:43 +DATE: 2024-06-03_18:13:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.738504e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.732597e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.167044e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.826886e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.944644e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.167643e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.684793 sec +TOTAL : 0.667400 sec INFO: No Floating Point Exceptions have been reported - 2,601,107,988 cycles # 2.822 GHz - 4,061,282,635 instructions # 1.56 insn per cycle - 0.978559950 seconds time elapsed + 2,582,478,119 cycles # 2.831 GHz + 3,921,579,026 instructions # 1.52 insn per cycle + 0.972378951 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.575213e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
2.020448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.020448e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572927e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.021325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021325e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.416526 sec +TOTAL : 4.421917 sec INFO: No Floating Point Exceptions have been reported - 12,654,142,005 cycles # 2.862 GHz - 32,510,363,434 instructions # 2.57 insn per cycle - 4.422251656 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) + 12,670,048,279 cycles # 2.863 GHz + 32,460,799,656 instructions # 2.56 insn per cycle + 4.427297536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 294) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.002255e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.874219e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.874219e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.982168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.835238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.835238e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.568265 sec +TOTAL : 3.599430 sec INFO: No Floating Point Exceptions have been reported - 10,224,593,553 cycles # 2.863 GHz - 24,472,095,992 instructions # 2.39 insn per cycle - 3.573538181 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) + 10,303,629,232 cycles # 2.859 GHz + 24,602,320,321 instructions # 2.39 insn per cycle + 3.604552236 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.169162e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.169863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.169863e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.163312e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.175141e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.175141e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.324119 sec +TOTAL : 3.330877 sec INFO: No Floating Point Exceptions have been reported - 9,111,176,688 cycles # 2.737 GHz - 16,922,082,397 instructions # 1.86 insn per cycle - 3.329746327 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) + 9,113,641,103 cycles # 2.732 GHz + 16,920,127,372 instructions # 1.86 insn per cycle + 3.336119061 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.220354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.270284e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.270284e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.226331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.302500e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.302500e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.254253 sec +TOTAL : 3.247561 sec INFO: No Floating Point Exceptions have been reported - 8,910,060,786 cycles # 2.734 GHz - 16,345,046,075 instructions # 1.83 insn per cycle - 3.260025356 seconds time elapsed + 8,894,526,436 cycles # 2.735 GHz + 16,333,311,875 instructions # 1.84 insn per cycle + 3.252816409 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.878740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.571241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.571241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.866287e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571770e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571770e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.773398 sec +TOTAL : 3.797015 sec INFO: No Floating Point Exceptions have been reported - 
7,901,326,876 cycles # 2.092 GHz - 14,582,511,484 instructions # 1.85 insn per cycle - 3.778605571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) + 7,945,914,620 cycles # 2.090 GHz + 14,570,610,289 instructions # 1.83 insn per cycle + 3.802243147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1012) (512y: 158) (512z: 954) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index fa73177cd7..3a280ed2df 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:17:10 +DATE: 2024-06-03_18:14:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.732503e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.754634e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.215471e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.829864e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.970963e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217527e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.688215 sec +TOTAL : 0.675302 sec INFO: No Floating Point Exceptions have been reported - 2,639,635,060 cycles # 2.820 GHz - 4,015,174,984 instructions # 1.52 insn per cycle - 0.994462896 seconds time elapsed + 2,543,380,020 cycles # 2.802 GHz + 3,941,057,239 instructions # 1.55 insn per cycle + 0.974156954 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.083631e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
2.949296e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.949296e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.036422e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.861638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.861638e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.441397 sec +TOTAL : 3.511896 sec INFO: No Floating Point Exceptions have been reported - 9,846,812,518 cycles # 2.858 GHz - 25,386,191,431 instructions # 2.58 insn per cycle - 3.446476615 seconds time elapsed + 9,977,246,915 cycles # 2.837 GHz + 25,414,242,774 instructions # 2.55 insn per cycle + 3.517152120 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.316581e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.557276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.557276e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.321836e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.587383e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.587383e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.142273 sec +TOTAL : 3.134839 sec INFO: No Floating Point Exceptions have been reported - 8,991,441,142 cycles # 2.857 GHz - 21,484,440,131 instructions # 2.39 insn per cycle - 3.147929478 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) + 8,976,689,142 cycles # 2.860 GHz + 21,408,195,057 instructions # 2.38 insn per cycle + 3.139945644 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 1112) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.320671e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.503530e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.503530e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.287559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.440355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.440355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.133297 sec +TOTAL : 3.172922 sec INFO: No Floating Point Exceptions have been reported - 8,580,721,113 cycles # 2.735 GHz - 15,811,719,082 instructions # 1.84 insn per cycle - 3.138961399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) + 8,695,276,950 cycles # 2.737 GHz + 15,871,278,326 instructions # 1.83 insn per cycle + 3.178201663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1503) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.375766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.617874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.617874e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.355752e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594201e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.069315 sec +TOTAL : 3.089945 sec INFO: No Floating Point Exceptions have been reported - 8,463,481,626 cycles # 2.754 GHz - 15,513,175,556 instructions # 1.83 insn per cycle - 3.074609069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) + 8,475,803,379 cycles # 2.739 GHz + 15,579,989,322 instructions # 1.84 insn per cycle + 3.095117552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1282) (512y: 141) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.011258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.824275e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.824275e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995463e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.811156e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.811156e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.549017 sec +TOTAL : 3.579131 sec INFO: No Floating Point Exceptions have been reported - 7,565,498,334 cycles # 2.129 GHz - 14,283,366,137 instructions # 1.89 insn per cycle - 3.554588261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) + 7,587,197,159 cycles # 2.118 GHz + 14,284,005,803 instructions # 1.88 insn per cycle + 3.584393140 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 876) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 78b8b832b6..bcbeba2deb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:49:45 +DATE: 2024-06-03_17:56:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.453404e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.301856e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.286518e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164643e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140650e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147287e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580838 sec +TOTAL : 0.590670 sec INFO: No Floating Point Exceptions have been reported - 2,286,209,161 cycles # 2.821 GHz - 3,532,764,689 instructions # 1.55 insn per cycle - 0.869255178 seconds time elapsed + 2,301,464,636 cycles # 2.819 GHz + 3,602,954,463 instructions # 1.57 insn per cycle + 0.873444960 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.078555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.280307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.280307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051421e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239912e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239912e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.210359 sec +TOTAL : 6.360765 sec INFO: No Floating Point Exceptions have been reported - 17,783,383,753 cycles # 2.862 GHz - 43,511,171,857 instructions # 2.45 insn per cycle - 6.215902116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,243,657,346 cycles # 2.866 GHz + 45,005,960,239 instructions # 2.47 insn per cycle + 6.365748081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377494e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377494e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.202499e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.355957e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.355957e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.231501 sec +TOTAL : 3.238806 sec INFO: No Floating Point Exceptions have been reported - 9,255,830,965 cycles # 2.863 GHz - 21,906,871,719 instructions # 2.37 insn per cycle - 3.236386372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,281,505,690 cycles # 2.862 GHz + 22,294,520,661 instructions # 2.40 insn per cycle + 3.243835029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.371336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.610412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.610412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353660e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.572726e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.572726e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.030129 sec +TOTAL : 3.047299 sec INFO: No Floating Point Exceptions have been reported - 8,294,048,623 cycles # 2.733 GHz - 15,590,527,403 instructions # 1.88 insn per cycle - 3.035436377 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,361,633,099 cycles # 2.740 GHz + 15,758,092,056 instructions # 1.88 insn per cycle + 3.052295265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.397036e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.670362e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.670362e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.399950e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.678079e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678079e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.001269 sec +TOTAL : 2.995029 sec INFO: No Floating Point Exceptions have been reported - 8,226,933,402 cycles # 2.737 GHz - 15,430,117,600 instructions # 1.88 insn per cycle - 3.006462525 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,228,763,442 cycles # 2.744 GHz + 15,611,452,650 instructions # 1.90 insn per cycle + 3.000008543 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.353627e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576193e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576193e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369890e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596618e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596618e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.054034 sec +TOTAL : 3.032697 sec INFO: No Floating Point Exceptions have been reported - 6,654,880,184 cycles # 2.176 GHz - 12,863,187,093 instructions # 1.93 insn per cycle - 3.059348788 seconds time elapsed + 6,617,314,348 cycles # 2.179 GHz + 12,864,001,473 instructions # 1.94 insn per cycle + 3.037918435 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 65b53d740f..fd083bf0e0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:26:49 +DATE: 2024-06-03_18:24:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.060712e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.025766e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.025766e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.121418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.423124e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.423124e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.728605 sec +TOTAL : 1.718861 sec INFO: No Floating Point Exceptions have been reported - 5,559,492,246 cycles # 2.840 GHz - 10,134,973,470 instructions # 1.82 insn per cycle - 2.015457056 seconds time elapsed + 5,539,636,530 cycles # 2.844 GHz + 10,034,793,253 instructions # 1.81 insn per cycle + 2.005948260 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -90,16 +90,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.061003e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.254218e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.254218e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.027970e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.209007e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.420879 sec +TOTAL : 6.611368 sec INFO: No Floating Point Exceptions have been reported - 18,399,874,886 cycles # 2.863 GHz - 43,656,453,581 instructions # 2.37 insn per cycle - 6.427487997 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,923,826,319 cycles # 2.860 GHz + 45,154,643,508 instructions # 2.39 insn per cycle + 6.617847071 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -119,16 +119,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.114817e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.166060e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.166060e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.092220e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.146266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.146266e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.495712 sec +TOTAL : 3.523089 sec INFO: No Floating Point Exceptions have been reported - 10,015,137,457 cycles # 2.860 GHz - 23,241,753,742 instructions # 2.32 insn per cycle - 3.502264923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 10,101,089,413 cycles # 2.863 GHz + 23,628,753,394 instructions # 2.34 insn per cycle + 3.529482790 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -148,16 +148,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.231996e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.238833e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347968e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347968e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.334222 sec +TOTAL : 3.323626 sec INFO: No Floating Point Exceptions have been reported - 9,138,089,758 cycles # 2.736 GHz - 16,713,258,734 instructions # 1.83 insn per cycle - 3.340848954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 9,165,072,954 cycles # 2.753 GHz + 16,876,243,244 instructions # 1.84 insn per cycle + 3.330172075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -177,16 +177,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.270258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.427950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.427950e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.271402e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.437298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.437298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.281372 sec +TOTAL : 3.278898 sec INFO: No Floating Point Exceptions have been reported - 9,005,942,306 cycles # 2.740 GHz - 16,548,921,552 instructions # 1.84 insn per cycle - 3.287811990 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 9,037,907,486 cycles # 2.752 GHz + 16,730,036,801 instructions # 1.85 insn per cycle + 3.285218692 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -206,15 +206,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.240131e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.309974e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.309974e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.251664e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.325465 sec +TOTAL : 3.310369 sec INFO: No Floating Point Exceptions have been reported - 7,397,809,520 cycles # 2.221 GHz - 14,072,596,030 instructions # 1.90 insn per cycle - 3.332157901 seconds time elapsed + 7,379,156,811 cycles # 2.226 GHz + 14,070,522,690 instructions # 1.91 insn per cycle + 3.316794525 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 41fcdf2cfe..352483b9b2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:37:54 +DATE: 2024-06-03_18:35:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.389352e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220547e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.248789e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.414064e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187425e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123209e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.212858 sec +TOTAL : 1.215276 sec INFO: No Floating Point Exceptions have been reported - 4,093,911,360 cycles # 2.853 GHz - 6,566,528,457 instructions # 1.60 insn per cycle - 1.491492030 seconds time elapsed + 4,088,679,623 cycles # 2.843 GHz + 6,534,825,098 instructions # 1.60 insn per cycle + 1.494464073 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary 
= CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.285638e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.285638e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.048831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237717e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237717e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.532719 sec +TOTAL : 6.724917 sec INFO: No Floating Point Exceptions have been reported - 18,783,324,742 cycles # 2.874 GHz - 43,693,376,231 instructions # 2.33 insn per cycle - 6.537953932 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 19,274,608,418 cycles # 2.865 GHz + 45,191,452,352 instructions # 2.34 insn per cycle + 6.730164945 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210512e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382085e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382085e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187170e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.339667e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.339667e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.582695 sec +TOTAL : 3.608148 sec INFO: No Floating Point Exceptions have been reported - 10,261,732,783 cycles # 2.861 GHz - 21,990,872,924 instructions # 2.14 insn per cycle - 3.587933515 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 10,343,597,433 cycles # 2.863 GHz + 22,378,027,937 instructions # 2.16 insn per cycle + 3.613416876 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.363828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.635295e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.635295e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.332970e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565979e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565979e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.389987 sec +TOTAL : 3.421134 sec INFO: No Floating Point Exceptions have been reported - 9,345,873,446 cycles # 2.754 GHz - 15,502,334,673 instructions # 1.66 insn per cycle - 3.395365367 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 9,428,567,434 cycles # 2.753 GHz + 15,670,954,000 instructions # 1.66 insn per cycle + 3.426333943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.383815e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.695694e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.695694e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.368735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.670315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.670315e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.376377 sec +TOTAL : 3.387000 sec INFO: No Floating Point Exceptions have been reported - 9,309,965,265 cycles # 2.754 GHz - 15,139,174,417 instructions # 1.63 insn per cycle - 3.381718320 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 9,333,760,322 cycles # 2.752 GHz + 15,321,820,627 instructions # 1.64 insn per cycle + 3.392134873 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.370232e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.594168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.594168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361931e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581750e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581750e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.394429 sec +TOTAL : 3.402251 sec INFO: No Floating Point Exceptions have been reported - 7,646,568,006 cycles # 2.250 GHz - 12,573,843,987 instructions # 1.64 insn per cycle - 3.399861496 seconds time elapsed + 7,642,447,849 cycles # 2.243 GHz + 12,573,604,096 instructions # 1.65 insn per cycle + 3.407581538 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index cbd445fde8..b6efbb5a3c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:35:09 +DATE: 2024-06-03_18:32:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.400821e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.237112e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.282800e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.442100e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215180e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.122824e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.873575 sec +TOTAL : 0.869353 sec INFO: No Floating Point Exceptions have been reported - 3,103,401,533 cycles # 2.833 GHz - 6,356,049,584 instructions # 2.05 insn per cycle - 1.152258826 seconds time elapsed + 3,095,265,258 cycles # 2.834 GHz + 6,379,863,801 instructions # 2.06 insn per cycle + 1.148344255 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary 
= CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.080565e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.282306e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.282306e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.049503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238510e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238510e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.200319 sec +TOTAL : 6.372456 sec INFO: No Floating Point Exceptions have been reported - 17,766,785,300 cycles # 2.864 GHz - 43,507,689,985 instructions # 2.45 insn per cycle - 6.205548449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,270,854,403 cycles # 2.866 GHz + 45,003,372,558 instructions # 2.46 insn per cycle + 6.377274532 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.219205e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391133e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391133e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.202091e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.354764e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.354764e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.222734 sec +TOTAL : 3.241559 sec INFO: No Floating Point Exceptions have been reported - 9,245,511,720 cycles # 2.865 GHz - 21,907,133,008 instructions # 2.37 insn per cycle - 3.228067776 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,293,257,475 cycles # 2.863 GHz + 22,294,148,302 instructions # 2.40 insn per cycle + 3.246883723 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.362472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.627969e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.627969e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.338458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.580425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580425e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.040708 sec +TOTAL : 3.067414 sec INFO: No Floating Point Exceptions have been reported - 8,344,967,895 cycles # 2.740 GHz - 15,591,025,923 instructions # 1.87 insn per cycle - 3.046295562 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,429,082,509 cycles # 2.744 GHz + 15,756,097,527 instructions # 1.87 insn per cycle + 3.072640159 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.690003e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.690003e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.368856e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.665021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.665021e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.024127 sec +TOTAL : 3.033024 sec INFO: No Floating Point Exceptions have been reported - 8,310,000,031 cycles # 2.744 GHz - 15,436,072,136 instructions # 1.86 insn per cycle - 3.029766146 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,320,312,555 cycles # 2.740 GHz + 15,609,926,581 instructions # 1.88 insn per cycle + 3.037988723 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.361982e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.586100e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.586100e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.372805e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.593211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.593211e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.044604 sec +TOTAL : 3.032379 sec INFO: No Floating Point Exceptions have been reported - 6,641,930,192 cycles # 2.179 GHz - 12,864,124,768 instructions # 1.94 insn per cycle - 3.049948851 seconds time elapsed + 6,617,917,246 cycles # 2.179 GHz + 12,863,669,922 instructions # 1.94 insn per cycle + 3.037700714 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 275da8993d..07585ab3a9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:32:24 +DATE: 2024-06-03_18:29:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,18 +50,18 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.808283e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.206237e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.142753e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.911492e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.182207e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.013995e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.519463 sec +TOTAL : 1.510644 sec INFO: No Floating Point Exceptions have been reported - 4,951,290,576 cycles # 2.843 GHz - 9,146,689,840 instructions # 1.85 insn per cycle - 1.798020957 seconds time elapsed + 4,929,344,895 cycles # 2.845 GHz + 9,108,041,198 instructions # 1.85 insn per cycle + 1.788993156 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -83,16 +83,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281294e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281294e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.049889e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239017e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.297116 sec +TOTAL : 6.369929 sec INFO: No Floating Point Exceptions have been reported - 18,034,882,237 cycles # 2.862 GHz - 43,508,302,495 instructions # 2.41 insn per cycle - 6.302484278 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,276,108,910 cycles # 2.867 GHz + 45,009,820,973 instructions # 2.46 insn per cycle + 6.375254703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210720e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.378151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.378151e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.203215e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.358762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358762e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.233511 sec +TOTAL : 3.240583 sec INFO: No Floating Point Exceptions have been reported - 9,271,850,153 cycles # 2.864 GHz - 21,907,043,465 instructions # 2.36 insn per cycle - 3.238909660 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,298,780,281 cycles # 2.866 GHz + 22,295,651,617 instructions # 2.40 insn per cycle + 3.245919845 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -139,16 +139,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.352851e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608306e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608306e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.327930e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.549134e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.549134e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.052167 sec +TOTAL : 3.079866 sec INFO: No Floating Point Exceptions have been reported - 8,355,651,202 cycles # 2.734 GHz - 15,591,192,622 instructions # 1.87 insn per cycle - 3.057529422 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,420,044,894 cycles # 2.731 GHz + 15,757,149,809 instructions # 1.87 insn per cycle + 3.085217641 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675709e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675709e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.356494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624833e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624833e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.022810 sec +TOTAL : 3.047415 sec INFO: No Floating Point Exceptions have been reported - 8,287,014,121 cycles # 2.737 GHz - 15,428,840,508 instructions # 1.86 insn per cycle - 3.028205243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,310,845,470 cycles # 2.723 GHz + 15,615,712,930 instructions # 1.88 insn per cycle + 3.052828049 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -195,15 +195,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.364932e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.586134e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.586134e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.366193e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582701e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582701e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.042115 sec +TOTAL : 3.040896 sec INFO: No Floating Point Exceptions have been reported - 6,626,503,704 cycles # 2.175 GHz - 12,863,711,552 instructions # 1.94 insn per cycle - 3.047484573 seconds time elapsed + 6,624,272,469 cycles # 2.176 GHz + 12,864,098,950 instructions # 1.94 insn per cycle + 3.046289418 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 3a0fd0a90a..f06578ead5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:50:12 +DATE: 2024-06-03_17:56:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.463816e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.296908e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328725e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.179867e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152324e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.182937e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577578 sec +TOTAL : 0.590804 sec INFO: No Floating Point Exceptions have been reported - 2,276,498,493 cycles # 2.822 GHz - 3,528,632,890 instructions # 1.55 insn per cycle - 0.863625263 seconds time elapsed + 2,308,504,890 cycles # 2.821 GHz + 3,608,810,202 instructions # 1.56 insn per cycle + 0.874918353 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.154961e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.387398e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.387398e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051623e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.240991e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.240991e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.823373 sec +TOTAL : 6.357908 sec INFO: No Floating Point Exceptions have been reported - 16,689,004,614 cycles # 2.865 GHz - 41,263,252,653 instructions # 2.47 insn per cycle - 5.828275571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) + 18,230,546,868 cycles # 2.866 GHz + 44,980,120,800 instructions # 2.47 insn per cycle + 6.363052088 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 410) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.271492e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527103e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527103e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.198260e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.343617e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.343617e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.154514 sec +TOTAL : 3.247815 sec INFO: No Floating Point Exceptions have been reported - 9,027,063,562 cycles # 2.858 GHz - 21,210,233,128 instructions # 2.35 insn per cycle - 3.159850522 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) + 9,312,738,759 cycles # 2.864 GHz + 22,262,519,463 instructions # 2.39 insn per cycle + 3.252975989 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.387654e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.648132e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.648132e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.359228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576517e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.011648 sec +TOTAL : 3.047650 sec INFO: No Floating Point Exceptions have been reported - 8,243,811,729 cycles # 2.736 GHz - 15,422,236,523 instructions # 1.87 insn per cycle - 3.016578474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) + 8,350,755,476 cycles # 2.736 GHz + 15,749,116,716 instructions # 1.89 insn per cycle + 3.052777331 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.441553e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.770792e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.770792e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.400758e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.672596e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.672596e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.953944 sec +TOTAL : 2.993253 sec INFO: No Floating Point Exceptions have been reported - 8,107,162,693 cycles # 2.740 GHz - 15,232,791,801 instructions # 1.88 insn per cycle - 2.959262610 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) + 8,214,715,534 cycles # 2.741 GHz + 15,591,753,579 instructions # 1.90 insn per cycle + 2.998240487 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2485) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.373912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.600116e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.600116e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.375214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.601309e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601309e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.031130 sec +TOTAL : 3.028196 sec INFO: No Floating Point Exceptions have been reported - 6,600,460,683 cycles # 2.175 GHz - 12,841,921,234 instructions # 1.95 insn per cycle - 3.036614829 seconds time elapsed + 6,596,197,870 cycles # 2.176 GHz + 12,844,320,466 instructions # 1.95 insn per cycle + 3.033260897 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 9c3ce37c8b..53015944b4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:17:34 +DATE: 2024-06-03_18:14:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.329688e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197353e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.294513e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594191e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.255518e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.111244e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.584260 sec +TOTAL : 0.578532 sec INFO: No Floating Point Exceptions have been reported - 2,290,837,964 cycles # 2.826 GHz - 3,567,143,300 instructions # 1.56 insn per cycle - 0.867513946 seconds time elapsed + 2,276,560,334 cycles # 2.810 GHz + 3,509,958,267 instructions # 1.54 insn per cycle + 0.871311573 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.595326e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.079631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.078888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.078888e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.323662 sec +TOTAL : 4.314114 sec INFO: No Floating Point Exceptions have been reported - 12,200,237,668 cycles # 2.819 GHz - 32,427,514,864 instructions # 2.66 insn per cycle - 4.329187293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) + 12,143,989,139 cycles # 2.813 GHz + 32,189,649,214 instructions # 2.65 insn per cycle + 4.319203265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.615965e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.427412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.427412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.630595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.454503e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.454503e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.793932 sec +TOTAL : 2.775394 sec INFO: No Floating Point Exceptions have been reported - 7,996,574,580 cycles # 2.860 GHz - 18,655,165,559 instructions # 2.33 insn per cycle - 2.799028174 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) + 7,955,697,825 cycles # 2.862 GHz + 18,698,571,181 instructions # 2.35 insn per cycle + 2.780527030 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1560) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.687284e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.399964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.399964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.690652e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.468642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.468642e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.721615 sec +TOTAL : 2.714160 sec INFO: No Floating Point Exceptions have been reported - 7,454,408,451 cycles # 2.735 GHz - 14,253,415,404 instructions # 1.91 insn per cycle - 2.727046264 seconds time elapsed + 7,473,599,264 cycles # 2.749 GHz + 14,248,936,269 instructions # 1.91 insn per cycle + 2.719148041 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.770821e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.626363e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.626363e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.730968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.602800e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.602800e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.650953 sec +TOTAL : 2.680982 sec INFO: No Floating Point Exceptions have been reported - 7,318,335,472 
cycles # 2.756 GHz - 13,948,037,827 instructions # 1.91 insn per cycle - 2.656532072 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) + 7,364,716,439 cycles # 2.743 GHz + 13,944,217,782 instructions # 1.89 insn per cycle + 2.686027677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2094) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.428325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729451e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.419153e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.705210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.705210e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.970494 sec +TOTAL : 2.980985 sec INFO: No Floating Point Exceptions have been reported - 6,503,944,976 cycles # 2.187 GHz - 13,423,073,698 instructions # 2.06 insn per cycle - 2.975897923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) + 6,519,305,235 cycles # 2.184 GHz + 13,428,800,724 instructions # 2.06 insn per cycle + 2.985939217 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2073) (512y: 1) (512z: 1197) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 76b55ad2e4..78aa8adf25 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:17:58 +DATE: 2024-06-03_18:15:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.324695e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210479e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333686e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596788e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.293054e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.216099e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.587402 sec +TOTAL : 0.570515 sec INFO: No Floating Point Exceptions have been reported - 2,292,933,061 cycles # 2.822 GHz - 3,530,135,889 instructions # 1.54 insn per cycle - 0.871362762 seconds time elapsed + 2,247,271,854 cycles # 2.826 GHz + 3,528,743,047 instructions # 1.57 insn per cycle + 0.851595191 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 
==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.106790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.056635e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.056635e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.134822e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.092898e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.092898e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.364784 sec +TOTAL : 3.323704 sec INFO: No Floating Point Exceptions have been reported - 9,485,686,184 cycles # 2.815 GHz - 25,263,356,042 instructions # 2.66 insn per cycle - 3.370276038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) + 9,341,713,175 cycles # 2.807 GHz + 25,628,381,739 instructions # 2.74 insn per cycle + 3.328792293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 256) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.961447e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.505102e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.505102e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.935423e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.423634e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.423634e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.509947 sec +TOTAL : 2.529370 sec INFO: No Floating Point Exceptions have been reported - 7,195,839,812 cycles # 2.862 GHz - 16,868,387,762 instructions # 2.34 insn per cycle - 2.515387369 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) + 7,246,753,871 cycles # 2.860 GHz + 16,870,793,706 instructions # 2.33 insn per cycle + 2.534506834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.832943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.808929e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.808929e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.821799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.828846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.828846e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.598583 sec +TOTAL : 2.608573 sec INFO: No Floating Point Exceptions have been reported - 7,142,287,207 cycles # 2.744 GHz - 13,617,950,967 instructions # 1.91 insn per cycle - 2.604113274 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) + 7,171,582,794 cycles # 2.745 GHz + 13,626,734,166 instructions # 1.90 insn per cycle + 2.613649240 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2061) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.905826e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.012392e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.012392e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.881454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.039760e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.039760e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.542873 sec +TOTAL : 2.559465 sec INFO: No Floating Point Exceptions have been reported - 7,030,524,595 cycles # 2.760 GHz - 13,426,027,213 instructions # 1.91 insn per cycle - 2.548245628 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) + 7,076,560,187 cycles # 2.760 GHz + 13,426,992,808 instructions # 1.90 insn per cycle + 2.564578714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1947) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.533501e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.984604e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.984604e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.543259e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.003390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.003390e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.864382 sec +TOTAL : 2.852367 sec INFO: No Floating Point Exceptions have been reported - 6,329,632,912 cycles # 2.206 GHz - 13,154,745,067 instructions # 2.08 insn per cycle - 2.870076647 seconds time elapsed + 6,308,825,768 cycles # 2.209 GHz + 13,154,958,113 instructions # 2.09 insn per cycle + 2.857417301 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 30bc197182..18a6685695 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:50:38 +DATE: 2024-06-03_17:57:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.732879e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.705839e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157350e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.318290e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.683416e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.140319e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.682384 sec +TOTAL : 0.694822 sec INFO: No Floating Point Exceptions have been reported - 2,597,825,046 cycles # 2.830 GHz - 4,090,205,188 instructions # 1.57 insn per cycle - 0.977759023 seconds time elapsed + 2,632,787,789 cycles # 2.825 GHz + 4,125,301,456 instructions # 1.57 insn per cycle + 0.988279407 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.031288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.206220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.206220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.933181e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.522879 sec +TOTAL : 6.752917 sec INFO: No Floating Point Exceptions have been reported - 18,693,180,609 cycles # 2.864 GHz - 44,222,141,009 instructions # 2.37 insn per cycle - 6.528296312 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) + 19,322,413,781 cycles # 2.860 GHz + 46,295,992,950 instructions # 2.40 insn per cycle + 6.758378943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 479) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.613969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.133791e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.133791e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594060e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096178e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096178e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.323747 sec +TOTAL : 4.370340 sec INFO: No Floating Point Exceptions have been reported - 12,389,439,717 cycles # 2.863 GHz - 30,920,154,197 instructions # 2.50 insn per cycle - 4.329261731 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) + 12,539,997,225 cycles # 2.867 GHz + 31,478,010,346 instructions # 2.51 insn per cycle + 4.375575128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1732) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.921394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.668547e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.668547e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.919152e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.671850e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.671850e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.698108 sec +TOTAL : 3.697636 sec INFO: No Floating Point Exceptions have been reported - 10,087,612,198 cycles # 2.725 GHz - 19,373,367,445 instructions # 1.92 insn per cycle - 3.703569148 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) + 10,080,701,963 cycles # 2.723 GHz + 19,468,552,192 instructions # 1.93 insn per cycle + 3.702944091 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2133) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.984824e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.800128e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.800128e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.949458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.715469e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.715469e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.590549 sec +TOTAL : 3.644169 sec INFO: No Floating Point Exceptions have been reported - 9,780,533,240 cycles # 2.721 GHz - 18,954,616,108 instructions # 1.94 insn per cycle - 3.596015305 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) + 9,924,266,504 cycles # 2.720 GHz + 19,219,972,572 instructions # 1.94 insn per cycle + 3.649461576 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1874) (512y: 189) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.719226e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.286801e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.286801e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.736545e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.311368e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.311368e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.082921 sec +TOTAL : 4.043422 sec INFO: No Floating Point Exceptions have been reported - 8,420,905,067 cycles # 2.060 GHz - 15,057,078,071 instructions # 1.79 insn per cycle - 4.088371462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) + 8,353,044,219 cycles # 2.064 GHz + 15,065,800,381 instructions # 1.80 insn per cycle + 4.048753454 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1028) (512y: 154) (512z: 1321) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 05d5e2d3d7..1519beb165 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:51:09 +DATE: 2024-06-03_17:57:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.579969e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.686020e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157503e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.355716e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694815e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156755e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687280 sec +TOTAL : 0.692525 sec INFO: No Floating Point Exceptions have been reported - 2,618,766,619 cycles # 2.826 GHz - 4,041,753,204 instructions # 1.54 insn per cycle - 0.983494772 seconds time elapsed + 2,622,244,403 cycles # 2.823 GHz + 4,091,592,081 instructions # 1.56 insn per cycle + 0.985574224 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.075110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.266502e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266502e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.849035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.144096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.273174 sec +TOTAL : 6.811046 sec INFO: No Floating Point Exceptions have been reported - 17,976,200,983 cycles # 2.863 GHz - 42,467,527,484 instructions # 2.36 insn per cycle - 6.278790336 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) + 19,367,363,181 cycles # 2.842 GHz + 46,233,493,704 instructions # 2.39 insn per cycle + 6.816378070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.651801e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.199325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.199325e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.582631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079535e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079535e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.233700 sec +TOTAL : 4.399702 sec INFO: No Floating Point Exceptions have been reported - 12,135,466,974 cycles # 2.863 GHz - 30,227,050,455 instructions # 2.49 insn per cycle - 4.239313548 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) + 12,583,865,657 cycles # 2.857 GHz + 31,452,363,261 instructions # 2.50 insn per cycle + 4.405099309 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.928405e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679872e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.679872e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.918861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.663659e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.663659e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.683536 sec +TOTAL : 3.700838 sec INFO: No Floating Point Exceptions have been reported - 10,048,331,425 cycles # 2.724 GHz - 19,255,994,226 instructions # 1.92 insn per cycle - 3.689022984 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) + 10,078,917,373 cycles # 2.720 GHz + 19,455,390,747 instructions # 1.93 insn per cycle + 3.706301399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2117) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.015749e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951655e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.728573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.728573e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.541051 sec +TOTAL : 3.642043 sec INFO: No Floating Point Exceptions have been reported - 9,640,245,530 cycles # 2.719 GHz - 18,744,573,817 instructions # 1.94 insn per cycle - 3.546545299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) + 9,923,186,488 cycles # 2.721 GHz + 19,285,155,635 instructions # 1.94 insn per cycle + 3.647395181 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1868) (512y: 189) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.763865e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.360552e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.360552e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.767522e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.368955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.368955e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.989990 sec +TOTAL : 3.981046 sec INFO: No Floating Point Exceptions have been reported - 8,258,527,369 cycles # 2.068 GHz - 14,978,587,265 instructions # 1.81 insn per cycle - 3.995476429 seconds time elapsed + 8,244,325,807 cycles # 2.069 GHz + 14,979,516,169 instructions # 1.82 insn per cycle + 3.986442883 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index e6ca4b3727..330c65bf94 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:51:39 +DATE: 2024-06-03_17:58:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571906e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.165675e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278273e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507794e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163633e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275901e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.533824 sec +TOTAL : 0.531152 sec INFO: No Floating Point Exceptions have been reported - 2,160,338,708 cycles # 2.816 GHz - 3,108,947,549 instructions # 1.44 insn per cycle - 0.826788991 seconds time elapsed + 2,172,514,009 cycles # 2.825 GHz + 3,097,319,785 instructions # 1.43 insn per cycle + 0.826126707 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.052422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.113200e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.113200e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.780038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.825479e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.825479e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.219551 sec +TOTAL : 6.000160 sec INFO: No Floating Point Exceptions have been reported - 14,990,786,657 cycles # 2.870 GHz - 38,373,509,892 instructions # 2.56 insn per cycle - 5.225136790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,224,093,003 cycles # 2.869 GHz + 45,944,443,660 instructions # 2.67 insn per cycle + 6.005443800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.420355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.607688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.607688e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.108544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.261195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.261195e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.176496 sec +TOTAL : 3.482499 sec INFO: No Floating Point Exceptions have been reported - 9,107,869,375 cycles # 2.863 GHz - 24,577,368,445 instructions # 2.70 insn per cycle - 3.182042391 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,010,591,709 cycles # 2.871 GHz + 27,842,727,910 instructions # 2.78 insn per cycle + 3.487798049 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.505330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.986443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.986443e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.875219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.247933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247933e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.015782 sec +TOTAL : 2.261397 sec INFO: No Floating Point Exceptions have been reported - 5,458,675,505 cycles # 2.701 GHz - 11,252,130,547 instructions # 2.06 insn per cycle - 2.021434813 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,075,167,839 cycles # 2.681 GHz + 12,586,731,458 instructions # 2.07 insn per cycle + 2.266904739 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.084191e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.669186e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.669186e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.356278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.809616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.809616e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.833429 sec +TOTAL : 2.067294 sec INFO: No Floating Point Exceptions have been reported - 4,938,916,416 cycles # 2.687 GHz - 10,556,489,069 instructions # 2.14 insn per cycle - 1.839288416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,563,695,224 cycles # 2.686 GHz + 12,021,605,529 instructions # 2.16 insn per cycle + 2.072698796 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.610931e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807983e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807983e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.399286e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.574109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.574109e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.014130 sec +TOTAL : 3.194495 sec INFO: No Floating Point Exceptions have been reported - 5,379,542,844 cycles # 1.782 GHz - 7,793,225,348 instructions # 1.45 insn per cycle - 3.019714352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,687,155,664 cycles # 1.778 GHz + 8,297,084,050 instructions # 1.46 insn per cycle + 3.199814441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 1fa6968bab..8d1d94459f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:27:19 +DATE: 2024-06-03_18:24:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.462166e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.205101e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.205101e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.531777e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238320e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238320e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.820247 sec +TOTAL : 0.815404 sec INFO: No Floating Point Exceptions have been reported - 3,038,196,334 cycles # 2.828 GHz - 4,716,972,261 instructions # 1.55 insn per cycle - 1.131805276 seconds time elapsed + 2,992,682,585 cycles # 2.828 GHz + 4,685,692,827 instructions # 1.57 insn per cycle + 1.115617940 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,16 +90,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.034127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.094165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.094165e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.765471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.810865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.810865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.347576 sec +TOTAL : 6.134495 sec INFO: No Floating Point Exceptions have been reported - 15,332,963,137 cycles # 2.864 GHz - 38,433,385,565 instructions # 2.51 insn per cycle - 5.354839876 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,602,475,077 cycles # 2.867 GHz + 46,002,972,709 instructions # 2.61 insn per cycle + 6.141458193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -107,8 +107,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -119,16 +119,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.391806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576311e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072742e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.223579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.223579e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.286080 sec +TOTAL : 3.608235 sec INFO: No Floating Point Exceptions have been reported - 9,426,586,757 cycles # 2.864 GHz - 24,763,935,790 instructions # 2.63 insn per cycle - 3.293082254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,371,923,734 cycles # 2.870 GHz + 28,025,852,686 instructions # 2.70 insn per cycle + 3.615212944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -148,16 +148,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.330091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.783188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.783188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.822050e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.191359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.191359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.166949 sec +TOTAL : 2.372316 sec INFO: No Floating Point Exceptions have been reported - 5,830,307,091 cycles # 2.683 GHz - 11,537,845,857 instructions # 1.98 insn per cycle - 2.174147046 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,428,545,735 cycles # 2.703 GHz + 12,872,354,230 instructions # 2.00 insn per cycle + 2.379155799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -177,16 +177,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.935200e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.505542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.505542e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.264619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.698712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.698712e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.966012 sec +TOTAL : 2.186815 sec INFO: No Floating Point Exceptions have been reported - 5,308,540,347 cycles # 2.692 GHz - 10,845,350,411 instructions # 2.04 insn per cycle - 1.973191716 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,919,973,980 cycles # 2.700 GHz + 12,311,338,626 instructions # 2.08 insn per cycle + 2.193659045 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -206,16 +206,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.557121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.748967e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.748967e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.354371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.543902e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.543902e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.148015 sec +TOTAL : 3.326227 sec INFO: No Floating Point Exceptions have been reported - 5,759,134,449 cycles # 1.827 GHz - 8,037,556,808 instructions # 1.40 insn per cycle - 3.155199268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 6,132,874,621 cycles # 1.841 GHz + 8,544,916,384 instructions # 1.39 insn per cycle + 3.333259222 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 564b56aaa2..965664b535 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:38:23 +DATE: 2024-06-03_18:35:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.834225e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174853e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276595e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.935211e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.173304e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276568e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.628184 sec +TOTAL : 0.629953 sec INFO: No Floating Point Exceptions have been reported - 2,420,276,537 cycles # 2.823 GHz - 3,537,411,505 instructions # 1.46 insn per cycle - 0.914703116 seconds time elapsed + 2,425,033,040 cycles # 2.821 GHz + 3,533,427,129 instructions # 1.46 insn per cycle + 0.916564365 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.042150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.102920e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.102920e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.776195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821804e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821804e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.309711 sec +TOTAL : 6.077661 sec INFO: No Floating Point Exceptions have been reported - 15,208,601,758 cycles # 2.862 GHz - 38,393,755,680 instructions # 2.52 insn per cycle - 5.315290014 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,427,004,730 cycles # 2.865 GHz + 45,959,309,920 instructions # 2.64 insn per cycle + 6.083278927 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.630223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.630223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.101458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.257493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.257493e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.221641 sec +TOTAL : 3.555023 sec INFO: No Floating Point Exceptions have been reported - 9,237,771,832 cycles # 2.863 GHz - 24,577,605,010 instructions # 2.66 insn per cycle - 3.227257120 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,225,297,905 cycles # 2.873 GHz + 27,842,100,711 instructions # 2.72 insn per cycle + 3.560578065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.426554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.901258e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.901258e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.882594e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.257869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.257869e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.107966 sec +TOTAL : 2.322439 sec INFO: No Floating Point Exceptions have been reported - 5,642,384,352 cycles # 2.670 GHz - 11,234,139,166 instructions # 1.99 insn per cycle - 2.113587927 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,291,312,792 cycles # 2.703 GHz + 12,568,769,792 instructions # 2.00 insn per cycle + 2.328056676 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.038246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.623434e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.623434e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.346787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.802546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.802546e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.913383 sec +TOTAL : 2.136932 sec INFO: No Floating Point Exceptions have been reported - 5,136,905,922 cycles # 2.679 GHz - 10,506,331,510 instructions # 2.05 insn per cycle - 1.919115165 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,796,366,053 cycles # 2.706 GHz + 11,970,782,633 instructions # 2.07 insn per cycle + 2.142516142 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.577150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.773033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.773033e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.432702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.107472 sec +TOTAL : 3.229295 sec INFO: No Floating Point Exceptions have been reported - 5,603,483,278 cycles # 1.801 GHz - 7,744,927,992 instructions # 1.38 insn per cycle - 3.113151850 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,897,294,349 cycles # 1.824 GHz + 8,246,732,764 instructions # 1.40 insn per cycle + 3.234708724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 1ac31e13f6..1740584361 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:35:36 +DATE: 2024-06-03_18:32:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.828351e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175882e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277851e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.946226e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175204e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276988e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.567256 sec +TOTAL : 0.566359 sec INFO: No Floating Point Exceptions have been reported - 2,253,776,484 cycles # 2.822 GHz - 3,518,982,258 instructions # 1.56 insn per cycle - 0.855545893 seconds time elapsed + 2,244,660,155 cycles # 2.822 GHz + 3,509,815,718 instructions # 1.56 insn per cycle + 0.854244701 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.056356e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.117975e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.117975e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.777744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.824118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.824118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.211101 sec +TOTAL : 6.010337 sec INFO: No Floating Point Exceptions have been reported - 15,016,692,408 cycles # 2.879 GHz - 38,373,187,740 instructions # 2.56 insn per cycle - 5.216643799 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,238,214,430 cycles # 2.866 GHz + 45,943,155,206 instructions # 2.67 insn per cycle + 6.015955555 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.467027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.660047e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.660047e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.100277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.254705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254705e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.136723 sec +TOTAL : 3.494866 sec INFO: No Floating Point Exceptions have been reported - 9,074,004,106 cycles # 2.889 GHz - 24,577,979,212 instructions # 2.71 insn per cycle - 3.142372869 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,035,630,342 cycles # 2.868 GHz + 27,842,669,439 instructions # 2.77 insn per cycle + 3.500612923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.443155e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.918966e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.918966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.870990e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.243848e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.243848e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.039960 sec +TOTAL : 2.264831 sec INFO: No Floating Point Exceptions have been reported - 5,475,053,241 cycles # 2.677 GHz - 11,251,295,295 instructions # 2.06 insn per cycle - 2.045540905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,092,216,208 cycles # 2.684 GHz + 12,585,707,297 instructions # 2.07 insn per cycle + 2.270557428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.050564e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.643604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.643604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.353191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.806875e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.806875e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.847523 sec +TOTAL : 2.072141 sec INFO: No Floating Point Exceptions have been reported - 4,963,669,399 cycles # 2.679 GHz - 10,556,626,951 instructions # 2.13 insn per cycle - 1.853394046 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,581,758,113 cycles # 2.688 GHz + 12,021,560,907 instructions # 2.15 insn per cycle + 2.077787033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.592373e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.788368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.788368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.372806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.548133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.031855 sec +TOTAL : 3.221900 sec INFO: No Floating Point Exceptions have been reported - 5,406,083,686 cycles # 1.780 GHz - 7,793,724,258 instructions # 1.44 insn per cycle - 3.037456860 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,733,219,688 cycles # 1.777 GHz + 8,300,008,708 instructions # 1.45 insn per cycle + 3.227549517 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 5a92d6747d..423530fe15 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:32:52 +DATE: 2024-06-03_18:30:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.752555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167872e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274609e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.772233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.173217e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274698e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.720859 sec +TOTAL : 0.721656 sec INFO: No Floating Point Exceptions have been reported - 2,687,445,313 cycles # 2.827 GHz - 4,266,641,488 instructions # 1.59 insn per cycle - 1.008523478 seconds time elapsed + 2,682,709,722 cycles # 2.825 GHz + 4,257,255,296 instructions # 1.59 insn per cycle + 1.008651062 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -83,16 +83,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046564e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.107338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781162e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827105e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.235890 sec +TOTAL : 5.999645 sec INFO: No Floating Point Exceptions have been reported - 15,009,426,379 cycles # 2.865 GHz - 38,373,420,682 instructions # 2.56 insn per cycle - 5.241597173 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,238,271,411 cycles # 2.871 GHz + 45,943,018,029 instructions # 2.67 insn per cycle + 6.005361351 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -100,8 +100,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.437646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.628123e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.628123e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.100108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.254803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254803e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.163337 sec +TOTAL : 3.494980 sec INFO: No Floating Point Exceptions have been reported - 9,075,620,291 cycles # 2.865 GHz - 24,578,067,979 instructions # 2.71 insn per cycle - 3.168963563 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,040,303,278 cycles # 2.869 GHz + 27,845,061,153 instructions # 2.77 insn per cycle + 3.500633160 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -139,16 +139,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.418817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.893074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.893074e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.873993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.251786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.251786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.047725 sec +TOTAL : 2.264207 sec INFO: No Floating Point Exceptions have been reported - 5,485,987,071 cycles # 2.673 GHz - 11,251,055,584 instructions # 2.05 insn per cycle - 2.053325818 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,092,809,767 cycles # 2.685 GHz + 12,585,659,630 instructions # 2.07 insn per cycle + 2.269920215 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.062072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.650490e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.650490e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.354382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.806589e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.806589e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.841979 sec +TOTAL : 2.070043 sec INFO: No Floating Point Exceptions have been reported - 4,952,856,258 cycles # 2.682 GHz - 10,558,518,877 instructions # 2.13 insn per cycle - 1.847644905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,575,902,651 cycles # 2.688 GHz + 12,022,417,349 instructions # 2.16 insn per cycle + 2.075736025 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -195,16 +195,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.597246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.793746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.793746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.385452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.559039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.559039e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.028351 sec +TOTAL : 3.210851 sec INFO: No Floating Point Exceptions have been reported - 5,400,904,302 cycles # 1.781 GHz - 7,793,425,655 instructions # 1.44 insn per cycle - 3.034012590 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,716,187,306 cycles # 1.778 GHz + 8,297,674,838 instructions # 1.45 insn per cycle + 3.216588828 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 58e2659367..8044c64e2b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:52:03 +DATE: 2024-06-03_17:58:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.589601e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168593e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279897e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.505313e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168076e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279932e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529965 sec +TOTAL : 0.529980 sec INFO: No Floating Point Exceptions have been reported - 2,176,096,086 cycles # 2.821 GHz - 3,116,651,489 instructions # 1.43 insn per cycle - 0.828693316 seconds time elapsed + 2,177,992,433 cycles # 2.826 GHz + 3,144,999,861 instructions # 1.44 insn per cycle + 0.827728090 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046807e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107510e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.107510e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.835420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883637e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.232787 sec +TOTAL : 5.821491 sec INFO: No Floating Point Exceptions have been reported - 15,011,109,733 cycles # 2.866 GHz - 40,100,143,330 instructions # 2.67 insn per cycle - 5.238308472 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) + 16,709,182,080 cycles # 2.868 GHz + 44,935,035,746 instructions # 2.69 insn per cycle + 5.826794315 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 581) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.800449e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.254078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.422040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422040e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.028090 sec +TOTAL : 3.332001 sec INFO: No Floating Point Exceptions have been reported - 8,685,388,720 cycles # 2.864 GHz - 23,672,029,686 instructions # 2.73 insn per cycle - 3.033712399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) + 9,562,607,479 cycles # 2.866 GHz + 26,700,619,348 instructions # 2.79 insn 
per cycle + 3.337351491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2344) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.870985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.244397e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.244397e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.461877e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.772979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.772979e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.263752 sec +TOTAL : 2.460573 sec INFO: No Floating Point Exceptions have been reported - 6,080,549,535 cycles # 2.681 GHz - 13,060,990,924 instructions # 2.15 insn per cycle - 2.269275292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) + 6,596,657,745 cycles # 2.676 GHz + 14,125,089,780 instructions # 2.14 insn per cycle + 2.466109830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2786) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.126541e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.542200e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.542200e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.654021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.990833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.990833e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.155718 sec +TOTAL : 2.362948 sec INFO: No Floating Point Exceptions have been reported - 5,801,329,189 cycles # 2.685 GHz - 12,321,707,264 instructions # 2.12 insn per cycle - 2.161291942 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) + 6,346,647,925 cycles # 2.681 GHz + 13,710,696,355 instructions # 2.16 insn per cycle + 2.368361225 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2437) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.300125e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.464144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.464144e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.256087e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.415473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415473e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.288245 sec +TOTAL : 3.330240 sec INFO: No Floating Point Exceptions have been reported - 5,828,079,793 cycles # 1.770 GHz - 9,603,396,173 instructions # 1.65 insn per cycle - 3.293946839 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) + 5,906,442,578 cycles # 1.771 GHz + 10,065,102,749 instructions # 1.70 insn per cycle + 3.335608318 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1291) (512y: 208) (512z: 1987) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index eacee14a97..73cacda685 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:18:20 +DATE: 2024-06-03_18:15:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.680825e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167882e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277071e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.200564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181073e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276469e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530821 sec +TOTAL : 0.523685 sec INFO: No Floating Point Exceptions have been reported - 2,180,475,973 cycles # 2.822 GHz - 3,129,539,684 instructions # 1.44 insn per cycle - 0.829975432 seconds time elapsed + 2,132,146,236 cycles # 2.820 GHz + 3,089,550,200 instructions # 1.45 insn per cycle + 0.812891889 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.369428e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.450686e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.450686e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369000e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.450802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450802e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.536778 sec +TOTAL : 4.537740 sec INFO: No Floating Point Exceptions have been reported - 13,015,636,000 cycles # 2.866 GHz - 34,387,703,055 instructions # 2.64 insn per cycle - 4.542541003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) + 13,022,402,458 cycles # 2.867 GHz + 34,354,727,792 instructions # 2.64 insn per cycle + 4.543265370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.924099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.059855e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.059855e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.903350e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.037707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.037707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.697505 sec +TOTAL : 3.724484 sec INFO: No Floating Point Exceptions have been reported - 10,607,346,172 cycles # 2.865 GHz - 24,007,082,338 instructions # 2.26 insn per cycle - 3.703200013 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) + 10,686,935,794 cycles # 2.866 GHz + 24,007,583,642 instructions # 2.25 insn 
per cycle + 3.730002429 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.415164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.721166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.721166e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.455535e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.769238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.769238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.486573 sec +TOTAL : 2.467399 sec INFO: No Floating Point Exceptions have been reported - 6,676,542,764 cycles # 2.680 GHz - 12,401,383,261 instructions # 1.86 insn per cycle - 2.492408166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) + 6,614,195,369 cycles # 2.676 GHz + 12,348,253,571 instructions # 1.87 insn per cycle + 2.472798844 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.719363e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.070259e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.070259e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.784525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.146203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.146203e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.333222 sec +TOTAL : 2.304448 sec INFO: No Floating Point Exceptions have been reported - 6,249,988,604 cycles # 2.673 GHz - 11,572,934,567 instructions # 1.85 insn per cycle - 2.339105101 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) + 6,172,989,918 cycles # 2.673 GHz + 11,569,702,810 instructions # 1.87 insn per cycle + 2.309924380 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2671) (512y: 239) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.635891e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.836056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.836056e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.608551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806496e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806496e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.994997 sec +TOTAL : 3.017997 sec INFO: No Floating Point Exceptions have been reported - 5,329,559,296 cycles # 1.777 GHz - 9,295,784,708 instructions # 1.74 insn per cycle - 3.000603709 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) + 
5,387,022,252 cycles # 1.782 GHz + 9,287,765,087 instructions # 1.72 insn per cycle + 3.023735174 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2114) (512y: 282) (512z: 1954) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 2a7449ccf8..4a353d3560 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:18:45 +DATE: 2024-06-03_18:15:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.690021e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.170643e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281113e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.200706e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184031e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279511e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529016 sec +TOTAL : 0.522181 sec INFO: No Floating Point Exceptions have been reported - 
2,181,786,873 cycles # 2.820 GHz - 3,123,798,739 instructions # 1.43 insn per cycle - 0.830314726 seconds time elapsed + 2,138,289,162 cycles # 2.834 GHz + 3,096,034,787 instructions # 1.45 insn per cycle + 0.811805326 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.498656e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.589488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.589488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.513254e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.605489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.307566 sec +TOTAL : 4.283257 sec INFO: No Floating Point Exceptions have been reported - 12,355,577,175 cycles # 2.865 GHz - 35,037,181,929 instructions # 2.84 insn per cycle - 4.313266681 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) + 12,289,516,686 cycles # 2.866 GHz + 34,923,121,667 instructions # 2.84 insn per cycle + 4.288584491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.900568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.034514e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.034514e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.900102e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.034022e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.034022e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.726420 sec +TOTAL : 3.727942 sec INFO: No Floating Point Exceptions have been reported - 10,682,800,271 cycles # 2.863 GHz - 23,083,133,822 instructions # 2.16 insn per cycle - 3.732131064 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) + 10,702,802,356 cycles # 2.868 GHz + 23,010,990,691 instructions # 2.15 insn per cycle + 3.733644631 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.781411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.142736e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.142736e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.700701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.048209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.048209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.304487 sec +TOTAL : 2.342453 sec INFO: No Floating Point Exceptions have been reported - 6,156,370,918 cycles # 2.666 GHz - 11,956,053,429 instructions # 1.94 insn per cycle - 2.310098952 seconds time elapsed + 6,283,660,596 cycles # 2.677 GHz + 11,956,874,467 instructions # 1.90 insn per cycle + 2.347962428 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.888437e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.261805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.261805e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.892088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.267236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.267236e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.256063 sec +TOTAL : 2.256039 sec INFO: No Floating Point Exceptions have been reported - 6,010,669,476 
cycles # 2.659 GHz - 11,128,968,945 instructions # 1.85 insn per cycle - 2.261765055 seconds time elapsed + 6,045,328,125 cycles # 2.674 GHz + 11,131,038,815 instructions # 1.84 insn per cycle + 2.261476465 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.710789e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.919169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.919169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.747196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.960337e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.960337e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.936689 sec +TOTAL : 2.910095 sec INFO: No Floating Point Exceptions have been reported - 5,226,987,569 cycles # 1.777 GHz - 9,022,159,593 instructions # 1.73 insn per cycle - 2.942633203 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) + 5,193,848,580 cycles # 1.782 GHz + 9,026,478,285 instructions # 1.74 insn per cycle + 2.915440965 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1570) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 109477ba28..3358c1fcdf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:52:27 +DATE: 2024-06-03_17:59:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.016847e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.697629e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.981893e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.482617 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.189397e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.170319e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.387519e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.487127 sec INFO: No Floating Point Exceptions have been reported - 1,998,642,469 cycles # 2.818 GHz - 2,880,747,261 instructions # 1.44 insn per cycle - 0.766260624 seconds time elapsed + 2,013,212,019 cycles # 2.819 GHz + 2,909,811,134 instructions # 1.45 insn per cycle + 0.770281917 seconds time elapsed runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.189515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.261171e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.261171e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.885414e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.938356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.938356e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) 
GeV^0 -TOTAL : 4.877134 sec +TOTAL : 5.649040 sec INFO: No Floating Point Exceptions have been reported - 13,994,891,793 cycles # 2.867 GHz - 38,340,768,488 instructions # 2.74 insn per cycle - 4.882458026 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,220,933,838 cycles # 2.870 GHz + 45,344,072,693 instructions # 2.80 insn per cycle + 5.654165812 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.868397e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.268797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.268797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
4.438186e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.768939e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.768939e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.243083 sec +TOTAL : 2.450780 sec INFO: No Floating Point Exceptions have been reported - 6,441,514,611 cycles # 2.866 GHz - 15,815,172,638 instructions # 2.46 insn per cycle - 2.248472682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,039,664,256 cycles # 2.868 GHz + 17,774,972,317 instructions # 2.52 insn per cycle + 2.455771294 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 8.776329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.004987e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004987e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.082016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.154135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.154135e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.283215 sec +TOTAL : 1.386052 sec INFO: No Floating Point Exceptions have been reported - 3,465,053,995 cycles # 2.691 GHz - 7,593,444,901 instructions # 2.19 insn per cycle - 1.288569725 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,733,426,497 cycles # 2.685 GHz + 8,265,424,322 instructions # 2.21 insn per cycle + 1.391191714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.465176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096622e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096622e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.446730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.624818e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.624818e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.196628 sec +TOTAL : 1.329069 sec INFO: No Floating Point Exceptions have been reported - 3,245,802,002 cycles # 2.702 GHz - 7,203,049,725 instructions # 2.22 insn per cycle - 1.202115916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,545,954,783 cycles # 2.659 GHz + 7,920,530,780 instructions # 2.23 insn per cycle + 1.334352866 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.681015e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.390488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.390488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.261718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.888190e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.888190e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.658088 sec +TOTAL : 1.762412 sec INFO: No Floating Point Exceptions have been reported - 3,068,008,054 cycles # 1.846 GHz - 5,834,677,054 instructions # 1.90 insn per cycle - 1.663441620 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 
3,252,410,609 cycles # 1.841 GHz + 6,100,423,263 instructions # 1.88 insn per cycle + 1.767662641 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index ecf1f25eca..97dec5a86e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:27:43 +DATE: 2024-06-03_18:24:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.942422e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.805237e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.805237e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.981473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.818775e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.818775e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.682307 sec +TOTAL : 0.680933 sec INFO: No Floating Point Exceptions have been reported - 2,573,557,720 cycles # 2.826 GHz - 4,026,704,050 instructions # 1.56 insn per cycle - 0.968234758 seconds time elapsed + 2,573,133,936 cycles # 2.826 GHz + 4,034,036,104 instructions # 1.57 insn per cycle + 0.968043602 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -76,8 +76,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -90,16 +90,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.164148e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234526e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234526e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.876784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929482e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.979751 sec +TOTAL : 5.721392 
sec INFO: No Floating Point Exceptions have been reported - 14,265,943,518 cycles # 2.864 GHz - 38,385,772,523 instructions # 2.69 insn per cycle - 4.986458560 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,410,500,959 cycles # 2.866 GHz + 45,384,280,796 instructions # 2.77 insn per cycle + 5.727881558 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -107,8 +107,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -119,16 +119,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.818621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.210781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.210781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.387912e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.711236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.711236e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.313915 sec +TOTAL : 2.528580 sec INFO: No Floating Point Exceptions have been reported - 6,643,916,276 cycles # 2.864 GHz - 16,095,500,762 instructions # 2.42 insn per cycle - 2.320575344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,256,716,109 cycles # 2.864 GHz + 18,057,230,835 instructions # 2.49 insn per cycle + 2.535013143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -136,8 +136,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -148,16 +148,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 8.616473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.887402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.887402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.916935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.978265e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.978265e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.355624 sec +TOTAL : 1.463017 sec INFO: No Floating Point Exceptions have been reported - 3,674,656,527 cycles # 2.699 GHz - 7,830,907,550 instructions # 2.13 insn per cycle - 1.362279184 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,946,830,503 cycles # 2.687 GHz + 8,502,419,899 instructions # 2.15 insn per cycle + 1.469468319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,8 +165,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -177,16 +177,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.223119e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.069627e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.069627e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.391597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.599398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.599398e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.276248 sec +TOTAL : 1.387422 sec INFO: No Floating Point Exceptions have been reported - 3,470,678,953 cycles # 2.706 GHz - 7,438,963,141 instructions # 2.14 insn per cycle - 1.283084734 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,764,832,151 cycles # 2.703 GHz + 8,158,322,672 instructions # 2.17 insn per cycle + 1.393974805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -194,8 +194,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -206,16 +206,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.605496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.298191e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.298191e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.166687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.764799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.764799e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.726954 sec +TOTAL : 1.839563 sec INFO: No Floating Point Exceptions have been reported - 3,271,998,736 cycles # 1.889 GHz - 6,089,399,163 instructions # 1.86 insn per cycle - 1.733677408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 
1888) + 3,463,042,965 cycles # 1.877 GHz + 6,355,050,636 instructions # 1.84 insn per cycle + 1.845923010 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -223,8 +223,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 0ad3eafec8..b054003c65 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:38:47 +DATE: 2024-06-03_18:36:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.959750e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.678565e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.983275e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.193088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.180658e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376626e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.579074 sec +TOTAL : 0.579235 sec INFO: No Floating Point Exceptions have been reported - 2,257,408,465 cycles # 2.820 GHz - 3,319,397,953 instructions # 1.47 insn per cycle - 0.858489340 seconds time elapsed + 2,259,058,959 cycles # 2.822 GHz + 3,332,330,914 instructions # 1.48 insn per cycle + 0.858238193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.184974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.256690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.256690e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881822e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.945704 sec +TOTAL : 5.718812 sec INFO: No Floating Point Exceptions have been reported - 14,170,625,155 cycles # 2.863 GHz - 38,370,527,150 instructions # 2.71 insn per cycle - 4.951076630 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,400,409,861 cycles # 2.866 GHz + 45,373,617,800 instructions # 2.77 insn per cycle + 5.724065508 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.860658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.261752e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.261752e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.427518e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.757072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.757072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.305703 sec +TOTAL : 2.515997 sec INFO: No Floating Point Exceptions have been reported - 6,613,265,265 cycles # 2.862 GHz - 15,828,283,667 instructions # 2.39 insn per cycle - 2.311146715 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,219,245,576 cycles # 2.864 GHz + 17,789,385,116 instructions # 2.46 insn per 
cycle + 2.521192317 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.686618e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.970674e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.970674e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.058523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.147643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.147643e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.355317 sec +TOTAL : 1.448735 sec INFO: No Floating Point Exceptions have been reported - 3,633,491,212 cycles # 2.672 GHz - 7,578,117,090 instructions # 2.09 insn per 
cycle - 1.360829033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,909,707,878 cycles # 2.690 GHz + 8,249,725,200 instructions # 2.11 insn per cycle + 1.454021088 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.321381e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.082734e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.082734e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.543553e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.792551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.792551e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 
3.404207e-03 ) GeV^0 -TOTAL : 1.273623 sec +TOTAL : 1.375214 sec INFO: No Floating Point Exceptions have been reported - 3,431,066,713 cycles # 2.684 GHz - 7,153,252,647 instructions # 2.08 insn per cycle - 1.278962900 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,723,811,727 cycles # 2.699 GHz + 7,870,129,072 instructions # 2.11 insn per cycle + 1.380400518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.705447e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.424288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.424288e+05 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 6.254541e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.873169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.873169e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.712725 sec +TOTAL : 1.824987 sec INFO: No Floating Point Exceptions have been reported - 3,232,442,354 cycles # 1.882 GHz - 5,785,846,161 instructions # 1.79 insn per cycle - 1.718151231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,423,559,926 cycles # 1.872 GHz + 6,050,752,549 instructions # 1.77 insn per cycle + 1.830318343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 4e4b68c02e..c835253773 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 
@@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:35:59 +DATE: 2024-06-03_18:33:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.829888e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.672325e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968559e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.521072 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.306399e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.181630e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.378764e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.518721 sec INFO: No Floating Point Exceptions have been reported - 2,127,774,211 cycles # 2.822 GHz - 3,322,390,866 instructions # 1.56 insn per cycle - 0.812598155 seconds time elapsed + 2,091,185,845 cycles # 2.826 GHz + 3,304,915,518 instructions # 1.58 insn per cycle + 0.797417153 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.188202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.259969e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.259969e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881998e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934912e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.880866 sec +TOTAL : 5.660184 sec INFO: No Floating Point Exceptions have been reported - 14,003,432,297 cycles # 2.867 GHz - 38,341,113,505 instructions # 2.74 insn per cycle - 4.886313267 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,228,253,084 cycles # 2.865 GHz + 45,339,793,508 instructions # 2.79 insn per cycle + 5.665482518 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.865782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.266690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.266690e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.430385e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.762372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.762372e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.245588 sec +TOTAL : 2.456993 sec INFO: No Floating Point Exceptions have been reported - 6,448,223,959 cycles # 2.866 GHz - 15,815,680,836 instructions # 2.45 insn per cycle - 2.250939876 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,054,966,301 cycles # 2.866 GHz + 17,775,169,086 instructions # 2.52 insn per cycle + 2.462396323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.646940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.924791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.924791e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.096232e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.198906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.198906e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.302035 sec +TOTAL : 
1.384702 sec INFO: No Floating Point Exceptions have been reported - 3,468,694,819 cycles # 2.655 GHz - 7,593,779,362 instructions # 2.19 insn per cycle - 1.307417897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,747,560,129 cycles # 2.697 GHz + 8,265,172,826 instructions # 2.21 insn per cycle + 1.390118552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.341284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.086225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.086225e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.514573e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 9.761076e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.761076e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.212139 sec +TOTAL : 1.321389 sec INFO: No Floating Point Exceptions have been reported - 3,264,189,304 cycles # 2.683 GHz - 7,202,777,705 instructions # 2.21 insn per cycle - 1.217586001 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,562,769,632 cycles # 2.687 GHz + 7,920,504,953 instructions # 2.22 insn per cycle + 1.326818698 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 6.695951e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.442709e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.442709e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.212259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.822619e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.822619e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.656278 sec +TOTAL : 1.777315 sec INFO: No Floating Point Exceptions have been reported - 3,079,117,307 cycles # 1.854 GHz - 5,835,044,949 instructions # 1.90 insn per cycle - 1.661779093 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,264,947,794 cycles # 1.833 GHz + 6,099,951,354 instructions # 1.87 insn per cycle + 1.782787601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 7d521e9bea..58e49d86b2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:33:16 +DATE: 2024-06-03_18:30:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,18 +50,18 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.584988e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.675915e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.967452e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.624537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.179236e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.368664e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.626563 sec +TOTAL : 0.626368 sec INFO: No Floating Point Exceptions have been reported - 2,397,623,441 cycles # 2.823 GHz - 3,754,246,774 instructions # 1.57 insn per cycle - 0.906158297 seconds time elapsed + 2,398,947,371 cycles # 2.829 GHz + 3,752,782,194 instructions # 1.56 insn per cycle + 0.905540211 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -70,8 +70,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -83,16 +83,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.188098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260287e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260287e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935038e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.881127 sec +TOTAL : 5.662100 sec INFO: No Floating Point 
Exceptions have been reported - 14,001,505,232 cycles # 2.866 GHz - 38,340,997,313 instructions # 2.74 insn per cycle - 4.886650264 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,227,209,162 cycles # 2.864 GHz + 45,340,117,828 instructions # 2.79 insn per cycle + 5.667445307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.864903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.265431e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.265431e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.425382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.754402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.754402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.245406 sec +TOTAL : 2.460362 sec INFO: No Floating Point Exceptions have been reported - 6,447,851,141 cycles # 2.865 GHz - 15,815,588,340 instructions # 2.45 insn per cycle - 2.250959020 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,057,208,189 cycles # 2.863 GHz + 17,774,934,002 instructions # 2.52 insn per cycle + 2.465805803 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -128,8 +128,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -139,16 +139,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.822705e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013257e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013257e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.060158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151567e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.277005 sec +TOTAL : 1.390519 sec INFO: No Floating Point Exceptions have been reported - 3,462,079,610 cycles # 2.701 GHz - 7,593,569,628 instructions # 2.19 insn per cycle - 1.282260547 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,750,841,770 cycles # 2.689 GHz + 8,266,163,693 instructions # 2.20 insn per cycle + 1.395877768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -156,8 +156,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.456827e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.099917e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.099917e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.541765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.784606e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.784606e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.197770 sec +TOTAL : 1.316558 sec INFO: No Floating Point Exceptions have been reported - 3,264,255,856 cycles # 2.715 GHz - 7,202,978,053 instructions # 2.21 insn per cycle - 1.203170555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,557,676,159 cycles # 2.693 GHz + 7,920,448,927 instructions # 2.23 insn per cycle + 1.321911082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -195,16 +195,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.703762e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.418726e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.418726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.236074e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.852334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.852334e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.654681 sec +TOTAL : 1.771281 sec INFO: No Floating Point Exceptions have been reported - 3,071,792,415 cycles # 1.854 GHz - 5,835,969,355 instructions # 1.90 insn per cycle - 1.660064901 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 
1888) + 3,260,314,929 cycles # 1.836 GHz + 6,099,684,038 instructions # 1.87 insn per cycle + 1.776894036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -212,8 +212,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 8b44c0445b..4f2f03096c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:52:47 +DATE: 2024-06-03_17:59:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.016058e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.669695e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.038322e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.483252 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.575450e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.442883e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719385e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.491778 sec INFO: No Floating Point Exceptions have been reported - 1,998,362,251 cycles # 2.820 GHz - 2,889,875,535 instructions # 1.45 insn per cycle - 0.765688803 seconds time elapsed + 2,019,426,299 cycles # 2.818 GHz + 2,897,302,470 instructions # 1.43 insn per cycle + 0.776260193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -68,8 
+68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.138992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207513e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915291e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969931e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969931e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.990603 sec +TOTAL : 5.561920 sec INFO: No Floating Point Exceptions have been reported - 14,312,095,417 cycles # 2.865 GHz - 39,833,075,351 instructions # 2.78 insn per cycle - 4.996093045 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) + 15,980,534,668 cycles # 2.871 GHz + 44,449,710,623 instructions # 2.78 insn per cycle + 5.567058871 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 550) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199028000236 -Relative difference = 4.790961076489297e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.631923e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.176181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.176181e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.192586e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.649473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.649473e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.951672 sec +TOTAL : 2.107774 sec INFO: No Floating Point Exceptions have been reported - 5,596,939,158 cycles # 2.861 GHz - 15,284,742,297 instructions # 2.73 insn per cycle - 1.956996618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) + 6,057,620,501 cycles # 2.868 GHz + 17,081,636,340 instructions # 2.82 insn per 
cycle + 2.112968987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.218680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.834401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.834401e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.911096e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.464242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.464242e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.775280 sec +TOTAL : 1.861423 sec INFO: No Floating Point Exceptions have been reported - 4,757,273,897 cycles # 2.673 GHz - 9,734,524,466 instructions # 2.05 insn per cycle - 
1.780718117 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) + 5,014,435,050 cycles # 2.687 GHz + 10,230,180,171 instructions # 2.04 insn per cycle + 1.866720473 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.294887e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.927248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.927248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.975955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.541648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.541648e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) 
GeV^0 -TOTAL : 1.754842 sec +TOTAL : 1.842744 sec INFO: No Floating Point Exceptions have been reported - 4,627,741,209 cycles # 2.630 GHz - 9,326,813,558 instructions # 2.02 insn per cycle - 1.760340221 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) + 4,967,642,651 cycles # 2.690 GHz + 10,001,951,713 instructions # 2.01 insn per cycle + 1.847768308 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3824) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.443462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.905052e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.905052e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = 
( 4.478820e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.786579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.786579e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.015644 sec +TOTAL : 2.429927 sec INFO: No Floating Point Exceptions have been reported - 3,664,457,552 cycles # 1.814 GHz - 7,034,592,113 instructions # 1.92 insn per cycle - 2.021014520 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2610) (512y: 12) (512z: 2220) + 4,360,807,561 cycles # 1.792 GHz + 8,449,096,692 instructions # 1.94 insn per cycle + 2.435243753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2897) (512y: 4) (512z: 2751) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183459779248 -Relative difference = 1.7053177021099307e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 0b4aad6d48..116d48b4c8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:19:09 +DATE: 2024-06-03_18:16:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.536351e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.649763e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.975434e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.490081 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.893743e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.209215e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382460e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.485128 sec INFO: No Floating Point Exceptions have been reported - 2,012,235,376 cycles # 2.813 GHz - 2,887,278,665 instructions # 1.43 insn per cycle - 0.773980012 seconds time elapsed + 1,998,933,549 cycles # 2.818 GHz + 2,886,521,387 instructions # 1.44 insn per cycle + 0.766563752 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.387689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.473438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.473438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.442511e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.532646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532646e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.481561 sec +TOTAL : 4.381681 sec INFO: No Floating Point Exceptions have been reported - 12,595,769,062 cycles # 2.808 GHz - 34,371,859,733 instructions # 2.73 insn per cycle - 4.486981898 seconds time elapsed + 12,577,682,953 cycles # 2.868 GHz + 34,626,967,775 instructions # 2.75 insn per cycle + 4.386763817 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.156182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.609115e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.609115e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.188953e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.647273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.647273e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.124099 sec +TOTAL : 2.110381 sec INFO: No Floating Point Exceptions have been reported - 6,097,938,896 cycles # 2.864 GHz - 14,860,412,482 instructions # 2.44 insn per cycle - 2.129667458 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) + 6,062,164,948 cycles # 2.867 GHz + 14,850,935,565 instructions # 2.45 insn per cycle + 2.115526112 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193803280592 -Relative difference = 1.8746278463897685e-07 +Avg ME (F77/C++) = 2.0288193414453417 +Relative difference = 1.6829758681196702e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.956935e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.736211e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.736211e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.948644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.746008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.746008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.596219 sec +TOTAL : 1.598307 sec INFO: No Floating Point Exceptions have been reported - 4,283,389,233 cycles # 2.675 GHz - 9,028,537,855 instructions # 2.11 insn per cycle - 1.601731069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) + 4,310,676,440 cycles # 2.690 GHz + 9,056,395,427 instructions # 2.10 insn per cycle + 1.603504485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4470) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288181974319741 +Relative difference = 9.731379272303266e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.081544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.893467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.893467e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.149704e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.008490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.008490e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.569456 sec +TOTAL : 1.555197 sec INFO: No Floating Point Exceptions have been reported - 4,194,103,331 cycles # 2.666 GHz - 8,663,712,018 instructions # 2.07 insn per cycle - 1.575061670 seconds time elapsed + 4,190,493,094 cycles # 2.688 GHz + 8,664,572,975 instructions # 2.07 insn per 
cycle + 1.560446046 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288181974319741 +Relative difference = 9.731379272303266e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.158310e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571602e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.176266e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.591500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.591500e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.121701 sec +TOTAL : 2.115034 sec INFO: No Floating Point Exceptions have been reported - 3,840,319,125 cycles # 1.806 GHz - 7,808,340,953 instructions # 2.03 insn per cycle - 2.127208383 seconds 
time elapsed + 3,838,944,051 cycles # 1.811 GHz + 7,808,393,263 instructions # 2.03 insn per cycle + 2.120392458 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4424) (512y: 0) (512z: 2555) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 68145ed810..71aecc0e65 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:19:29 +DATE: 2024-06-03_18:16:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.581017e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.707013e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.045464e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.486421 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.042762e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.498495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.721786e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.481742 sec 
INFO: No Floating Point Exceptions have been reported - 2,005,170,721 cycles # 2.817 GHz - 2,885,331,094 instructions # 1.44 insn per cycle - 0.769664626 seconds time elapsed + 1,994,870,659 cycles # 2.822 GHz + 2,887,372,486 instructions # 1.45 insn per cycle + 0.763571243 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592369e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 2.693697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.693697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.582587e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.683226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.683226e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.133554 sec +TOTAL : 4.148957 sec INFO: No Floating Point Exceptions have been reported - 11,751,435,134 cycles # 2.840 GHz - 35,107,900,053 instructions # 2.99 insn per cycle - 4.139079374 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) + 11,805,670,893 cycles # 2.843 GHz + 35,094,586,200 instructions # 2.97 insn per cycle + 4.154168605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.304158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.779755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.779755e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.294390e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.771165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.771165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.066374 sec +TOTAL : 2.069902 sec INFO: No Floating Point Exceptions have been reported - 5,955,734,514 cycles # 2.876 GHz - 14,470,820,860 instructions # 2.43 insn per cycle - 2.071726681 seconds time elapsed + 5,951,056,214 cycles # 2.869 GHz + 14,470,509,640 instructions # 2.43 insn per cycle + 2.075071006 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.262038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.115917e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.115917e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.194905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053976e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.531929 sec +TOTAL : 1.546412 sec INFO: No Floating Point Exceptions have been reported - 4,141,893,808 cycles 
# 2.695 GHz - 8,874,492,613 instructions # 2.14 insn per cycle - 1.537451545 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) + 4,171,092,335 cycles # 2.689 GHz + 8,883,426,870 instructions # 2.13 insn per cycle + 1.551600108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288182104704902 +Relative difference = 1.0374044905426431e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.243724e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.092067e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.092067e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.232209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.105029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.105029e+05 ) 
sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.535382 sec +TOTAL : 1.537952 sec INFO: No Floating Point Exceptions have been reported - 4,153,568,465 cycles # 2.697 GHz - 8,412,828,463 instructions # 2.03 insn per cycle - 1.540818202 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) + 4,138,913,630 cycles # 2.684 GHz + 8,410,635,266 instructions # 2.03 insn per cycle + 1.543068814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3314) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288182104704902 +Relative difference = 1.0374044905426431e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.345549e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.788380e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.788380e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.264406e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.694500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.694500e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.050082 sec +TOTAL : 2.081032 sec INFO: No Floating Point Exceptions have been reported - 3,776,688,425 cycles # 1.838 GHz - 7,700,644,489 instructions # 2.04 insn per cycle - 2.055466903 seconds time elapsed + 3,784,430,137 cycles # 1.815 GHz + 7,701,504,575 instructions # 2.04 insn per cycle + 2.086318740 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3440) (512y: 0) (512z: 2107) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index ac74dccede..a7f36bcee1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:53:07 +DATE: 2024-06-03_17:59:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.554923e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.166179e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277525e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507163e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163262e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276787e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530564 sec +TOTAL : 0.538303 sec INFO: No Floating Point Exceptions have been reported - 2,183,555,188 cycles # 2.822 GHz - 3,144,815,139 instructions # 1.44 insn per cycle - 0.830734274 seconds time elapsed + 2,165,151,740 cycles # 2.801 GHz + 3,099,392,657 instructions # 1.43 insn per cycle + 0.834137975 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.007712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065929e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.065929e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.771690e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816324e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.334891 sec +TOTAL : 6.027557 sec INFO: No Floating Point Exceptions have been reported - 15,289,472,900 cycles # 2.864 GHz - 38,577,953,274 instructions # 2.52 insn per cycle - 5.340497151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) + 17,393,761,998 cycles # 2.884 GHz + 46,095,398,077 instructions # 2.65 insn per cycle + 6.033053485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.464249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.655908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.655908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.091341e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.242349e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.242349e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.137831 sec +TOTAL : 3.502392 sec INFO: No Floating Point Exceptions have been reported - 8,973,826,641 cycles # 2.856 GHz - 24,223,107,065 instructions # 2.70 insn per cycle - 3.143393706 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) + 9,983,647,191 cycles # 2.847 GHz + 27,594,046,832 instructions # 2.76 insn per 
cycle + 3.507812390 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.480038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.956414e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.956414e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.924038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.302955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.302955e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.024686 sec +TOTAL : 2.239342 sec INFO: No Floating Point Exceptions have been reported - 5,399,489,629 cycles # 2.661 GHz - 11,276,345,804 instructions # 2.09 insn per cycle - 2.030286202 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) + 6,002,425,078 cycles # 2.675 GHz + 12,490,276,444 instructions # 2.08 insn per cycle + 2.244729020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2780) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.087786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.678670e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.678670e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.391265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.848955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.848955e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.832786 sec +TOTAL : 2.055015 sec INFO: No Floating Point Exceptions have been reported - 4,867,249,834 cycles # 2.649 GHz - 10,525,904,310 instructions # 2.16 insn per cycle - 1.838359941 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) + 5,505,020,906 cycles # 2.674 GHz + 11,927,639,782 instructions # 2.17 insn per cycle + 2.060518547 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2531) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.703127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.911454e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.911454e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.468701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.651354e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651354e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.942466 sec +TOTAL : 3.133138 sec INFO: No Floating Point Exceptions have been reported - 5,244,087,950 cycles # 1.780 GHz - 7,604,896,768 instructions # 1.45 insn per cycle - 2.947952496 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) + 5,572,843,285 cycles # 1.776 GHz + 8,116,354,124 instructions # 1.46 insn per cycle + 3.138888887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1668) (512y: 126) (512z: 1862) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index e93587dbb1..0d17bfb092 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:53:31 +DATE: 2024-06-03_18:00:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.488756e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158798e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278167e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507488e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165492e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276653e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532803 sec +TOTAL : 0.530194 sec INFO: No Floating Point Exceptions have been reported - 2,183,879,430 cycles # 2.808 GHz - 3,134,949,776 instructions # 1.44 insn per cycle - 0.834793767 seconds time elapsed + 2,167,878,364 cycles # 2.823 GHz + 3,109,854,489 instructions # 1.43 insn per cycle + 0.825240075 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.997980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.055486e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.055486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.805887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852897e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852897e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.359665 sec +TOTAL : 5.915886 sec INFO: No Floating Point Exceptions have been reported - 15,360,836,715 cycles # 2.864 GHz - 40,374,148,262 instructions # 2.63 insn per cycle - 5.365170950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) + 16,977,594,162 cycles # 2.868 GHz + 45,124,282,986 instructions # 2.66 insn per cycle + 5.921217258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.653686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.867948e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.867948e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.290200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.461592e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.461592e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.980978 sec +TOTAL : 3.297067 sec INFO: No Floating Point Exceptions have been reported - 8,537,424,840 cycles # 2.860 GHz - 23,255,933,901 instructions # 2.72 insn per cycle - 2.986536514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) + 9,474,741,584 cycles # 2.870 GHz + 26,246,059,686 instructions # 2.77 insn per 
cycle + 3.302677343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2397) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.675454e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.025638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.025638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.354754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.651843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.651843e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.354095 sec +TOTAL : 2.518683 sec INFO: No Floating Point Exceptions have been reported - 6,271,057,831 cycles # 2.659 GHz - 12,961,948,705 instructions # 2.07 insn per cycle - 2.359820621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) + 6,735,700,717 cycles # 2.669 GHz + 14,039,091,079 instructions # 2.08 insn per cycle + 2.524292410 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.936868e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.317677e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.317677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.594752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.925919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.925919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.234617 sec +TOTAL : 2.392171 sec INFO: No Floating Point Exceptions have been reported - 5,930,155,286 cycles # 2.648 GHz - 12,239,916,481 instructions # 2.06 insn per cycle - 2.240091992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) + 6,402,125,730 cycles # 2.671 GHz + 13,528,701,240 instructions # 2.11 insn per cycle + 2.397739364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.444871e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.625260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.625260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464006e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.644808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644808e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.154913 sec +TOTAL : 3.137581 sec INFO: No Floating Point Exceptions have been reported - 5,603,434,208 cycles # 1.774 GHz - 8,746,306,669 instructions # 1.56 insn per cycle - 3.160521781 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) + 5,585,907,221 cycles # 1.778 GHz + 9,215,484,758 instructions # 1.65 insn per cycle + 3.143155754 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1453) (512y: 212) (512z: 2059) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d2a9436bac..ebda5e548c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:53:55 +DATE: 2024-06-03_18:00:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.828792e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058201e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.496575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.048174e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064535e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469991 sec +TOTAL : 0.472291 sec INFO: No Floating Point Exceptions have been reported - 1,950,502,519 cycles # 2.813 GHz - 2,806,448,865 instructions # 1.44 insn per cycle - 0.749484229 seconds time elapsed + 1,944,574,836 cycles # 2.817 GHz + 2,773,147,259 instructions # 1.43 insn per cycle + 0.747270416 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
1.080589e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.327376e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341190e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.086792e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326818e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.340800e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609725 sec +TOTAL : 0.615400 sec INFO: No Floating Point Exceptions have been reported - 2,396,738,847 cycles # 2.820 GHz - 3,668,256,125 instructions # 1.53 insn per cycle - 0.908964727 seconds time elapsed + 2,409,226,641 cycles # 2.819 GHz + 3,595,245,677 instructions # 1.49 insn per cycle + 0.915650252 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.392275e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404239e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404239e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.374863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.386689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.386689e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.873703 sec +TOTAL : 6.922997 sec INFO: No Floating Point Exceptions have been reported - 19,791,549,123 cycles # 2.878 GHz - 59,606,317,603 instructions # 3.01 insn per cycle - 6.878041961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) + 19,884,239,052 cycles # 2.871 GHz + 59,920,809,546 instructions # 3.01 insn per cycle + 6.927215258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
1212) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.568257e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.612303e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.612303e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.463550e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.505286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.505286e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.610171 sec +TOTAL : 3.694475 sec INFO: No Floating Point Exceptions have been reported - 10,370,530,942 cycles # 2.870 GHz - 30,676,186,235 instructions # 2.96 insn per cycle - 3.614641811 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) + 10,604,185,605 cycles # 2.867 GHz + 31,094,882,685 instructions # 2.93 insn per cycle + 3.698744087 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5233) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.953227e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119594e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119594e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.858143e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.020160e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.020160e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.853859 sec +TOTAL : 1.873129 sec INFO: No Floating Point Exceptions have been reported - 4,895,759,086 cycles # 2.636 GHz - 11,018,740,137 instructions # 2.25 insn per cycle - 1.858168783 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) + 4,996,519,277 cycles # 2.663 GHz + 11,413,075,235 instructions # 2.28 insn per cycle + 1.877381447 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4653) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.002285e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.022719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.022719e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.939252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.013970e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013970e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.658467 sec +TOTAL : 1.671902 sec INFO: No Floating Point Exceptions have been reported - 4,377,789,034 cycles # 2.634 GHz - 10,296,146,857 instructions # 2.35 insn per cycle - 1.662826466 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) + 4,459,214,327 cycles # 2.662 GHz + 10,671,791,558 instructions # 2.39 insn per cycle + 1.676188201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4395) (512y: 91) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.862327e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.962239e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.962239e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.825648e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.923292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.923292e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.413722 sec +TOTAL : 2.426057 sec INFO: No Floating Point Exceptions have been reported - 4,103,494,859 cycles # 1.698 GHz - 5,842,470,718 instructions # 1.42 insn per cycle - 2.418058321 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 
3466) + 4,141,058,833 cycles # 1.704 GHz + 5,974,244,939 instructions # 1.44 insn per cycle + 2.430391084 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1629) (512y: 95) (512z: 3576) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index a85c881c90..eb34157c53 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_21:28:04 +DATE: 2024-06-03_18:25:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.555829e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.818637e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.818637e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.584531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.976756e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976756e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.500626 sec +TOTAL : 0.499776 sec INFO: No Floating Point Exceptions have been reported - 2,009,885,240 cycles # 2.812 GHz - 3,066,226,195 instructions # 1.53 insn per cycle - 0.771725422 seconds time elapsed + 2,012,995,564 cycles # 2.821 GHz + 3,041,094,374 instructions # 1.51 insn per cycle + 0.770525018 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.701616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.932321e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.932321e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.717101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.982761e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.982761e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.829861 sec +TOTAL : 0.825846 sec INFO: No Floating Point Exceptions have been reported - 3,061,531,665 cycles # 2.832 GHz - 4,949,277,454 instructions # 1.62 insn per cycle - 1.138627743 seconds time elapsed + 3,047,783,214 cycles # 2.833 GHz + 4,940,094,414 instructions # 1.62 insn per cycle + 1.136340341 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.385150e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.397380e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.397380e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.364897e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.377090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377090e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.902584 sec +TOTAL : 6.960938 sec INFO: No Floating Point Exceptions have been reported - 19,794,872,013 cycles # 2.866 GHz - 59,611,558,170 instructions # 3.01 insn per cycle - 6.907141027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) + 19,949,623,161 cycles # 2.865 GHz + 59,928,901,076 instructions # 3.00 insn per cycle + 6.965534824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1212) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.557869e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.602026e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.602026e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.452940e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.495356e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.495356e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.626237 sec +TOTAL : 3.712004 sec INFO: No Floating Point Exceptions have been reported - 10,407,659,030 cycles # 2.867 GHz - 30,722,342,234 instructions # 2.95 insn per cycle - 3.630843347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) + 10,658,944,463 cycles # 2.869 GHz + 31,146,122,122 instructions # 2.92 insn per cycle + 3.716787966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5233) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.913116e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.082779e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.082779e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.837623e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.005313e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.005313e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.870398 sec +TOTAL : 1.887457 sec INFO: No Floating Point Exceptions have been reported - 4,945,675,416 cycles # 2.639 GHz - 11,067,795,090 instructions # 2.24 insn per cycle - 1.874949750 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) + 5,042,590,042 cycles # 2.666 GHz + 11,463,290,873 instructions # 2.27 insn per cycle + 1.892080136 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4653) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.991691e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.020412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.020412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.912443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.011944e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.011944e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.672210 sec +TOTAL : 1.684689 sec INFO: No Floating Point Exceptions have been reported - 4,419,204,279 cycles # 2.637 GHz - 10,345,034,833 instructions # 2.34 insn per cycle - 1.676771727 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) + 4,502,957,413 cycles # 2.667 GHz + 10,720,861,961 instructions # 2.38 insn per cycle + 1.689237193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4395) (512y: 91) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.843006e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.942637e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.942637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.781909e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879331e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879331e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.429398 sec +TOTAL : 2.451274 sec INFO: No Floating Point Exceptions have been reported - 4,154,030,623 cycles # 1.707 GHz - 5,882,157,165 instructions # 1.42 insn per cycle - 2.434135528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) 
(512z: 3466) + 4,196,086,430 cycles # 1.709 GHz + 6,013,875,420 instructions # 1.43 insn per cycle + 2.455998982 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1629) (512y: 95) (512z: 3576) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 9c5400dc3c..a30e15379c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:54:21 +DATE: 2024-06-03_18:01:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.728787e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040296e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054902e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.445164e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056751e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469961 sec +TOTAL : 0.474176 sec INFO: No Floating Point 
Exceptions have been reported - 1,945,140,914 cycles # 2.814 GHz - 2,797,132,683 instructions # 1.44 insn per cycle - 0.748195009 seconds time elapsed + 1,943,847,656 cycles # 2.811 GHz + 2,740,790,877 instructions # 1.41 insn per cycle + 0.749383492 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.071125e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312843e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325872e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078272e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313524e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326650e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609321 sec +TOTAL : 0.608270 sec INFO: No Floating Point Exceptions have been reported - 2,388,183,446 cycles # 2.820 GHz - 3,624,721,355 instructions # 1.52 insn per cycle - 0.907535221 seconds time elapsed + 2,398,054,354 cycles # 2.828 GHz + 3,623,922,979 instructions # 1.51 insn per cycle + 0.907154023 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.416052e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.428253e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.428253e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369615e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.381392e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.381392e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.805799 sec +TOTAL : 6.938575 sec INFO: No Floating Point Exceptions have been reported - 19,509,656,309 cycles # 2.865 GHz - 58,797,581,425 instructions # 3.01 insn per cycle - 6.810019541 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) + 19,924,877,146 cycles # 2.871 GHz + 60,135,086,712 instructions # 3.02 insn per cycle + 6.942820252 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1335) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.624636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.669607e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.669607e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.537192e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.580141e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.580141e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.566261 sec +TOTAL : 3.634362 sec INFO: No Floating Point Exceptions have been reported - 10,224,672,260 cycles # 2.864 GHz - 30,345,523,778 instructions # 2.97 insn per cycle - 3.570675168 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) + 10,428,527,121 cycles # 2.867 GHz + 30,694,765,319 instructions # 2.94 insn per cycle + 3.638602748 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5059) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.622392e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.780322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.780322e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.601162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.754382e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.754382e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.924079 sec +TOTAL : 1.928356 sec INFO: No Floating Point Exceptions have been reported - 5,063,711,278 cycles # 2.628 GHz - 11,483,381,207 instructions # 2.27 insn per cycle - 1.928457003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) + 5,139,069,824 cycles # 2.660 GHz + 11,845,314,592 instructions # 2.30 insn per cycle + 1.932635801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4759) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.428126e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.609483e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.609483e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.349025e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.527469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.527469e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.762110 sec +TOTAL : 1.775911 sec INFO: No Floating Point Exceptions have been reported - 4,654,381,781 cycles # 2.637 GHz - 10,841,512,729 instructions # 2.33 insn per cycle - 1.766492012 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) + 4,732,346,737 cycles # 2.660 GHz + 11,170,822,632 instructions # 2.36 insn per cycle + 1.780229441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 245) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.831561e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.933974e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.933974e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.768854e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866037e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.866037e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.424320 sec +TOTAL : 2.446344 sec INFO: No Floating Point Exceptions have been reported - 4,122,188,832 cycles # 1.698 GHz - 6,106,386,209 instructions # 1.48 insn per cycle - 2.428663337 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 
3568) + 4,167,220,480 cycles # 1.701 GHz + 6,225,852,056 instructions # 1.49 insn per cycle + 2.450525643 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1525) (512y: 140) (512z: 3678) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 25f5a9a1db..d5011c542d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:54:47 +DATE: 2024-06-03_18:01:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.457737e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273716e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366330e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.164978e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.938816e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022860e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.454346 sec +TOTAL : 0.455835 sec INFO: No Floating Point Exceptions 
have been reported - 1,882,812,006 cycles # 2.812 GHz - 2,667,678,817 instructions # 1.42 insn per cycle - 0.728316472 seconds time elapsed + 1,889,397,429 cycles # 2.815 GHz + 2,651,505,840 instructions # 1.40 insn per cycle + 0.728542305 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 227 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.267220e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.427979e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.526801e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.499240 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.909385e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.914593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.984981e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 +TOTAL : 0.505996 sec INFO: No Floating Point Exceptions have been reported - 2,052,043,922 cycles # 2.819 GHz - 2,965,868,392 instructions # 1.45 insn per cycle - 0.784455397 seconds time elapsed + 2,072,435,695 cycles # 2.821 GHz + 3,006,042,205 instructions # 1.45 insn per cycle + 0.792850900 seconds time elapsed 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412608e+00 -Avg ME (F77/GPU) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214305330990 +Relative difference = 0.0004349621183379836 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,25 +96,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.468259e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.481049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.455284e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.468142e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.468142e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.660760 sec +TOTAL : 6.697187 sec INFO: No Floating Point Exceptions have been reported - 19,087,341,831 cycles # 2.864 GHz - 58,960,382,092 instructions # 3.09 insn per cycle - 6.664849133 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) + 19,231,773,979 
cycles # 2.870 GHz + 59,620,868,157 instructions # 3.10 insn per cycle + 6.701301879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 972) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -124,25 +124,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.119436e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.261857e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.261857e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.883262e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.018595e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.018595e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.039807 sec +TOTAL : 2.099868 sec INFO: No Floating Point Exceptions have been reported - 
5,851,713,678 cycles # 2.864 GHz - 16,693,562,801 instructions # 2.85 insn per cycle - 2.044009980 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) + 6,023,746,068 cycles # 2.864 GHz + 17,069,553,194 instructions # 2.83 insn per cycle + 2.104057020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5867) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954647353316 +Relative difference = 3.2890090308261873e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -152,25 +152,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.728939e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.791004e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.791004e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.969206 sec +EvtsPerSec[Rmb+ME] (23) = ( 
1.693129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.752626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.752626e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.989464 sec INFO: No Floating Point Exceptions have been reported - 2,595,644,078 cycles # 2.669 GHz - 5,979,320,953 instructions # 2.30 insn per cycle - 0.973332836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) + 2,644,653,272 cycles # 2.663 GHz + 6,193,862,153 instructions # 2.34 insn per cycle + 0.993621228 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,25 +180,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 1.907804e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.986792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.986792e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.881217 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.865244e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.937490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937490e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.900025 sec INFO: No Floating Point Exceptions have been reported - 2,345,880,719 cycles # 2.652 GHz - 5,602,748,051 instructions # 2.39 insn per cycle - 0.885574348 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) + 2,409,522,680 cycles # 2.666 GHz + 5,798,061,377 instructions # 2.41 insn per cycle + 0.904266583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4920) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.406889e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.449235e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.449235e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.387108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.427869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427869e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.188517 sec +TOTAL : 1.205183 sec INFO: No Floating Point Exceptions have been reported - 2,058,034,828 cycles # 1.727 GHz - 3,333,328,616 instructions # 1.62 insn per cycle - 1.192698457 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) + 2,084,041,087 cycles # 1.725 GHz + 3,398,650,768 instructions # 1.63 insn per cycle + 1.209392897 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2238) (512y: 39) (512z: 3787) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index e87a092429..27d39c227c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_21:28:30 +DATE: 2024-06-03_18:25:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.706813e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.038920e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038920e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.785942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.033343e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.033343e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.466185 sec +TOTAL : 0.464883 sec INFO: No Floating Point Exceptions have been reported - 1,911,951,129 cycles # 2.814 GHz - 2,841,120,455 instructions # 1.49 insn per cycle - 0.735601781 seconds time elapsed + 1,914,723,949 cycles # 2.822 GHz + 2,841,749,042 instructions # 1.48 insn per cycle + 0.734851797 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 227 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= @@ -79,24 +79,24 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.584141e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.645916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.645916e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.648144 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.584091e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.526997e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.526997e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 +TOTAL : 0.646853 sec INFO: No Floating Point Exceptions have been reported - 2,489,864,035 cycles # 2.825 GHz - 3,827,644,098 instructions # 1.54 insn per cycle - 0.938216487 seconds time elapsed + 2,489,386,165 cycles # 2.829 GHz + 3,848,805,319 instructions # 1.55 insn per cycle + 0.936355064 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412608e+00 -Avg ME (F77/GPU) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214305330990 +Relative difference = 0.0004349621183379836 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.480910e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.493960e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.493960e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.451669e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.464494e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.464494e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.630728 sec +TOTAL : 6.709628 sec INFO: No Floating Point Exceptions have been reported - 19,100,779,078 cycles # 2.879 GHz - 58,964,120,971 instructions # 3.09 insn per cycle - 6.635020831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) + 19,244,071,354 cycles # 2.867 GHz + 59,624,819,172 instructions # 3.10 insn per cycle + 6.714018744 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 972) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -138,25 +138,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.112476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.258255e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.258255e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.876682e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.018166e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.018166e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.046410 sec +TOTAL : 2.106720 sec INFO: No Floating Point Exceptions have been reported - 5,888,238,325 cycles # 2.872 GHz - 16,741,878,300 instructions # 2.84 insn per cycle - 2.050817243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) 
(512y: 0) (512z: 0) + 6,048,127,238 cycles # 2.866 GHz + 17,116,783,460 instructions # 2.83 insn per cycle + 2.111089880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5867) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954647353316 +Relative difference = 3.2890090308261873e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -167,25 +167,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.723304e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.786360e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.786360e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.977338 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.653152e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.712585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.712585e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 
5.002468e+01 ) GeV^-2 +TOTAL : 1.017936 sec INFO: No Floating Point Exceptions have been reported - 2,615,739,499 cycles # 2.667 GHz - 6,017,192,558 instructions # 2.30 insn per cycle - 0.981679944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) + 2,674,239,143 cycles # 2.617 GHz + 6,230,764,637 instructions # 2.33 insn per cycle + 1.022355051 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,25 +196,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.915953e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.994516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.994516e+05 ) sec^-1 
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.881911 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.856094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929527e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.910035 sec INFO: No Floating Point Exceptions have been reported - 2,367,964,767 cycles # 2.674 GHz - 5,639,235,283 instructions # 2.38 insn per cycle - 0.886220730 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) + 2,430,364,587 cycles # 2.661 GHz + 5,835,243,014 instructions # 2.40 insn per cycle + 0.914365378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4920) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.400847e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.442998e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.442998e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.424523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424523e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.198499 sec +TOTAL : 1.213191 sec INFO: No Floating Point Exceptions have been reported - 2,084,095,957 cycles # 1.733 GHz - 3,374,916,702 instructions # 1.62 insn per cycle - 1.202990814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) + 2,110,463,788 cycles # 1.734 GHz + 3,440,119,960 instructions # 1.63 insn per cycle + 1.217545966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2238) (512y: 39) (512z: 3787) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index b3b78f68de..016a71e02c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:55:08 +DATE: 2024-06-03_18:02:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.497603e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.303121e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.398772e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.157640e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.930293e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.025008e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.451033 sec +TOTAL : 0.454926 sec INFO: No Floating Point Exceptions have been reported - 1,878,344,567 cycles # 2.819 GHz - 2,673,672,832 instructions # 1.42 insn per cycle - 0.723231427 seconds time elapsed + 1,891,883,564 cycles # 2.818 GHz + 2,657,665,460 instructions # 
1.40 insn per cycle + 0.728906539 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 221 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.242647e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.389213e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.475124e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.498086 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.937550e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.960296e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.028764e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 +TOTAL : 0.506114 sec INFO: No Floating Point Exceptions have been reported - 2,053,404,426 cycles # 2.826 GHz - 2,966,260,637 instructions # 1.44 insn per cycle - 0.782979940 seconds time elapsed + 2,076,281,969 cycles # 2.822 GHz + 3,000,969,388 instructions # 1.45 insn per cycle + 0.793032794 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412608e+00 -Avg ME (F77/GPU) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214305330990 +Relative difference = 0.0004349621183379836 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,25 +96,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.479080e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.492104e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.492104e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.439985e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.452524e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.452524e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.631451 sec +TOTAL : 6.737070 sec INFO: No Floating Point Exceptions have been reported - 18,984,604,647 cycles # 2.862 GHz - 58,702,110,153 instructions # 3.09 insn per cycle - 6.635623922 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) + 19,431,566,977 cycles # 2.883 GHz + 59,361,173,188 instructions # 3.05 insn per cycle + 6.741226799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1040) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -124,25 +124,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.506447e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.662703e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.662703e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.308400e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.454140e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.454140e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.947376 sec +TOTAL : 1.992790 sec INFO: No Floating Point Exceptions have been reported - 5,589,202,869 cycles # 2.865 GHz - 16,510,174,954 instructions # 2.95 insn per cycle - 1.951631264 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5551) (avx2: 0) (512y: 0) 
(512z: 0) + 5,753,899,727 cycles # 2.883 GHz + 16,856,331,975 instructions # 2.93 insn per cycle + 1.996902777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954647353316 +Relative difference = 3.2890090308261873e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -152,25 +152,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.493410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.539841e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.539841e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.118677 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.500767e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547334e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) 
GeV^-2 +TOTAL : 1.113231 sec INFO: No Floating Point Exceptions have been reported - 2,976,233,441 cycles # 2.652 GHz - 6,633,667,708 instructions # 2.23 insn per cycle - 1.122890193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) + 3,022,289,723 cycles # 2.706 GHz + 6,854,476,939 instructions # 2.27 insn per cycle + 1.117430952 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5739) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,25 +180,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.625860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.680500e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.680500e+05 ) sec^-1 -MeanMatrixElemValue = ( 
1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.029211 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.597334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.650106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.650106e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.047323 sec INFO: No Floating Point Exceptions have been reported - 2,757,551,412 cycles # 2.670 GHz - 6,254,933,924 instructions # 2.27 insn per cycle - 1.033320955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) + 2,810,159,694 cycles # 2.674 GHz + 6,444,290,635 instructions # 2.29 insn per cycle + 1.051666815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5521) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.290714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.325632e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325632e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.272649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306381e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.306381e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.293262 sec +TOTAL : 1.311231 sec INFO: No Floating Point Exceptions have been reported - 2,228,539,636 cycles # 1.719 GHz - 3,697,845,631 instructions # 1.66 insn per cycle - 1.297458184 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2391) (512y: 29) (512z: 3970) + 2,260,530,890 cycles # 1.720 GHz + 3,762,323,072 instructions # 1.66 insn per cycle + 1.315383432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2469) (512y: 29) (512z: 4082) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 1aea1ca46b..7c9cec7e5b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:55:29 +DATE: 2024-06-03_18:02:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.705675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040108e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054231e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.364718e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.031397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047638e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.467689 sec +TOTAL : 0.473744 sec INFO: No Floating Point Exceptions have been reported - 1,951,012,614 cycles # 2.822 GHz - 2,780,514,605 instructions # 1.43 insn per cycle - 0.747566497 seconds time elapsed + 1,947,725,894 cycles # 2.818 GHz + 2,805,290,478 instructions # 1.44 insn per cycle + 0.747729470 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
1.071549e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.315004e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.328475e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.079036e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315033e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328388e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609446 sec +TOTAL : 0.611082 sec INFO: No Floating Point Exceptions have been reported - 2,398,520,319 cycles # 2.827 GHz - 3,697,559,551 instructions # 1.54 insn per cycle - 0.906881942 seconds time elapsed + 2,407,302,120 cycles # 2.826 GHz + 3,697,067,837 instructions # 1.54 insn per cycle + 0.910744181 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.346860e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.358448e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.358448e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334345e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345812e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345812e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.005881 sec +TOTAL : 7.042999 sec INFO: No Floating Point Exceptions have been reported - 20,061,545,492 cycles # 2.863 GHz - 60,534,513,586 instructions # 3.02 insn per cycle - 7.010263470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) + 20,221,850,441 cycles # 2.870 GHz + 60,954,392,946 instructions # 3.01 insn per cycle + 7.047197015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
1233) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.628417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.673382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.673382e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.519746e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.562485e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.562485e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.563203 sec +TOTAL : 3.648318 sec INFO: No Floating Point Exceptions have been reported - 10,193,843,427 cycles # 2.858 GHz - 30,384,715,959 instructions # 2.98 insn per cycle - 3.567486704 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) + 10,489,870,662 cycles # 2.873 GHz + 30,832,759,019 instructions # 2.94 insn per cycle + 3.652585279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.060307e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.230512e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.230512e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.905378e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.068897e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.068897e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.832180 sec +TOTAL : 1.863571 sec INFO: No Floating Point Exceptions have been reported - 4,873,743,401 cycles # 2.655 GHz - 10,979,146,931 instructions # 2.25 insn per cycle - 1.836546702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) + 4,962,857,454 cycles # 2.658 GHz + 11,366,629,197 instructions # 2.29 insn per cycle + 1.867862143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4782) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.032510e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054207e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054207e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.035087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.035087e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.610269 sec +TOTAL : 1.638933 sec INFO: No Floating Point Exceptions have been reported - 4,286,427,813 cycles # 2.656 GHz - 10,247,731,306 instructions # 2.39 insn per cycle - 1.614556045 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) + 4,385,977,782 cycles # 2.670 GHz + 10,616,380,005 instructions # 2.42 insn per cycle + 1.643255169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4515) (512y: 83) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.692625e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.784575e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.784575e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.631134e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.722110e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.722110e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.474016 sec +TOTAL : 2.496704 sec INFO: No Floating Point Exceptions have been reported - 4,210,263,291 cycles # 1.700 GHz - 6,043,220,655 instructions # 1.44 insn per cycle - 2.478297594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) + 4,254,957,859 cycles # 1.702 GHz + 6,172,800,294 instructions # 1.45 insn per cycle + 2.500959660 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2141) (512y: 117) (512z: 3652) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 1c6d0ff5f8..7f0d5e8677 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:55:55 +DATE: 2024-06-03_18:02:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.735452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041244e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.055299e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.462653e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041036e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057390e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469831 sec +TOTAL : 0.473993 sec INFO: No Floating Point Exceptions have been reported - 1,948,866,708 cycles # 2.819 GHz - 2,802,491,668 instructions # 1.44 insn per cycle - 0.748686662 seconds time elapsed + 1,945,651,451 cycles # 2.818 GHz + 2,806,210,093 instructions # 1.44 insn per cycle + 0.748369834 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
1.070810e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308874e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321967e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.075770e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309834e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323116e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.605052 sec +TOTAL : 0.609160 sec INFO: No Floating Point Exceptions have been reported - 2,392,252,084 cycles # 2.830 GHz - 3,645,625,496 instructions # 1.52 insn per cycle - 0.903478683 seconds time elapsed + 2,396,352,595 cycles # 2.825 GHz + 3,579,944,538 instructions # 1.49 insn per cycle + 0.909918067 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.367403e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.379210e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.379210e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.327535e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.338914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338914e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.944976 sec +TOTAL : 7.062977 sec INFO: No Floating Point Exceptions have been reported - 19,868,797,568 cycles # 2.860 GHz - 59,935,823,047 instructions # 3.02 insn per cycle - 6.949220462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) + 20,277,498,382 cycles # 2.870 GHz + 61,179,962,230 instructions # 3.02 insn per cycle + 7.067219454 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
1285) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.689813e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.736104e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736104e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.581110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.625265e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.625265e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.516734 sec +TOTAL : 3.599791 sec INFO: No Floating Point Exceptions have been reported - 10,083,295,126 cycles # 2.864 GHz - 30,097,719,684 instructions # 2.98 insn per cycle - 3.521023820 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) + 10,346,310,423 cycles # 2.871 GHz + 30,542,909,720 instructions # 2.95 insn per cycle + 3.604137951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.780572e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.943341e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.943341e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.602120e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.754758e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.754758e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.889615 sec +TOTAL : 1.928065 sec INFO: No Floating Point Exceptions have been reported - 5,024,798,861 cycles # 2.654 GHz - 11,482,219,428 instructions # 2.29 insn per cycle - 1.893950854 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) + 5,154,340,530 cycles # 2.668 GHz + 11,880,615,676 instructions # 2.30 insn per cycle + 1.932456074 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4893) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.644667e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.830260e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.830260e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.479310e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.664048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.664048e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.722143 sec +TOTAL : 1.751853 sec INFO: No Floating Point Exceptions have been reported - 4,588,199,336 cycles # 2.659 GHz - 10,809,611,838 instructions # 2.36 insn per cycle - 1.726402099 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) + 4,683,675,227 cycles # 2.668 GHz + 11,173,952,183 instructions # 2.39 insn per cycle + 1.756187164 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4520) (512y: 238) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.668589e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758731e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758731e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.604900e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.695150e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.695150e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.482767 sec +TOTAL : 2.505893 sec INFO: No Floating Point Exceptions have been reported - 4,227,913,144 cycles # 1.701 GHz - 6,273,317,964 instructions # 1.48 insn per cycle - 2.486971348 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) + 4,267,946,917 cycles # 1.701 GHz + 6,413,270,989 instructions # 1.50 insn per cycle + 2.510099754 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2037) (512y: 163) (512z: 3730) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 06aa0981a7..bae1743dda 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:56:21 +DATE: 2024-06-03_18:03:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.484170e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.510124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.448075e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.480692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483372e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529714 sec +TOTAL : 0.534727 sec INFO: No Floating Point Exceptions have been reported - 2,187,150,016 cycles # 2.828 GHz - 3,407,204,108 instructions # 1.56 insn per cycle - 0.831981040 seconds time elapsed + 2,160,892,959 cycles # 2.821 GHz + 3,385,854,093 instructions # 1.57 insn per cycle + 0.824886873 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
4.126702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.160633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.162012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.128072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.161312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162714e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.045826 sec +TOTAL : 3.052385 sec INFO: No Floating Point Exceptions have been reported - 9,422,847,840 cycles # 2.851 GHz - 20,052,737,736 instructions # 2.13 insn per cycle - 3.360343374 seconds time elapsed + 9,488,215,870 cycles # 2.858 GHz + 20,102,748,501 instructions # 2.12 insn per cycle + 3.378618806 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.837101e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837996e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.833875e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.834753e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.834753e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.937233 sec +TOTAL : 8.952951 sec INFO: No Floating Point Exceptions have been reported - 25,623,864,923 cycles # 2.867 GHz - 78,942,890,669 instructions # 3.08 insn per cycle - 8.941589016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,692,730,603 cycles # 2.869 GHz + 78,963,630,993 instructions # 3.07 insn per cycle + 8.957178674 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.527505e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.530758e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.530758e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.418565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.421561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.421561e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.659217 sec +TOTAL : 4.806739 sec INFO: No Floating Point Exceptions have been reported - 12,887,852,449 cycles # 2.765 GHz - 39,283,888,678 instructions # 3.05 insn per cycle - 4.663553220 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,124,597,322 cycles # 2.729 GHz + 39,567,241,453 instructions # 3.01 insn per cycle + 4.811058536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.819266e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.834708e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.834708e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.807184e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.822253e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.822253e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.106980 sec +TOTAL : 2.109980 sec INFO: No Floating Point Exceptions have been reported - 5,581,493,843 cycles # 2.645 GHz - 13,685,869,165 instructions # 2.45 insn per cycle - 2.111397973 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,625,324,473 cycles # 2.662 GHz + 13,831,103,677 instructions # 2.46 insn per cycle + 2.114152989 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.940031e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.960919e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.960919e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.917724e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.938311e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.938311e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.844023 sec +TOTAL : 1.848430 sec INFO: No Floating Point Exceptions have been reported - 4,890,407,622 cycles # 2.647 GHz - 12,340,850,912 instructions # 2.52 insn per cycle - 1.848367657 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,937,289,641 cycles # 2.666 GHz + 12,512,672,274 instructions # 2.53 insn per cycle + 1.852715240 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.735647e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.747350e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.747350e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.698211e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.709522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.709522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.444477 sec +TOTAL : 2.458283 sec INFO: No Floating Point Exceptions have been reported - 4,109,625,734 cycles # 1.679 GHz - 6,334,694,015 instructions # 1.54 insn per cycle - 2.448820329 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,149,503,000 cycles # 1.686 GHz + 6,398,619,235 instructions # 1.54 insn per cycle + 2.462674392 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 638dc04e22..591f3c3a40 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:29:17 +DATE: 2024-06-03_18:26:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.091974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.434049e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.434049e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120114e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465842e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520202 sec +TOTAL : 0.523164 sec INFO: No Floating Point Exceptions have been reported - 2,115,997,044 cycles # 2.821 GHz - 3,356,177,989 instructions # 1.59 insn per cycle - 0.810142197 seconds time elapsed + 2,123,157,197 cycles # 2.820 GHz + 3,317,966,729 instructions # 1.56 insn per cycle + 0.813410467 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.632273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.129586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.129586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.633710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.125236e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.322225 sec +TOTAL : 3.321653 sec INFO: No Floating Point Exceptions have been reported - 10,271,397,573 cycles # 2.856 GHz - 22,004,537,141 instructions # 2.14 insn per cycle - 3.652772092 seconds time elapsed + 10,250,803,498 cycles # 2.854 GHz + 21,387,097,503 instructions # 2.09 insn per cycle + 3.648104166 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.833131e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.834044e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.834044e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.828668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.829556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.829556e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.960944 sec +TOTAL : 8.983076 sec INFO: No Floating Point Exceptions have been reported - 25,662,418,157 cycles # 2.863 GHz - 78,944,265,965 instructions # 3.08 insn per cycle - 8.965544443 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,767,042,680 cycles # 2.867 GHz + 78,969,761,034 instructions # 3.06 insn per cycle + 8.987670049 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.536360e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.539829e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.539829e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.417531e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.420733e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.420733e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.652216 sec +TOTAL : 4.812631 sec INFO: No Floating Point Exceptions have been reported - 12,900,905,409 cycles # 2.771 GHz - 39,296,118,040 instructions # 3.05 insn per cycle - 4.656875062 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,140,367,247 cycles # 2.728 GHz + 39,580,393,488 instructions # 3.01 insn per cycle + 4.817209290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.851670e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.867737e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.867737e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.785368e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.801748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.801748e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.103024 sec +TOTAL : 2.120902 sec INFO: No Floating Point Exceptions have been reported - 5,594,201,816 cycles # 2.655 GHz - 13,697,712,232 instructions # 2.45 insn per cycle - 2.107624615 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,646,935,532 cycles # 2.658 GHz + 13,843,170,636 instructions # 2.45 insn per cycle + 2.125599955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.921358e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.943578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.943578e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.853249e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.874783e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.874783e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.852398 sec +TOTAL : 1.866411 sec INFO: No Floating Point Exceptions have been reported - 4,909,478,389 cycles # 2.645 GHz - 12,351,405,876 instructions # 2.52 insn per cycle - 1.857024744 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,959,147,596 cycles # 2.652 GHz + 12,524,330,168 instructions # 2.53 insn per cycle + 1.870952591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.741453e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.753934e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.753934e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.744564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.757032e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.757032e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.447091 sec +TOTAL : 2.446868 sec INFO: No Floating Point Exceptions have been reported - 4,126,055,402 cycles # 1.684 GHz - 6,345,698,997 instructions # 1.54 insn per cycle - 2.451723990 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,167,603,212 cycles # 1.701 GHz + 6,410,053,608 instructions # 1.54 insn per cycle + 2.451645185 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 79d60d2a9e..65f6877a1f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:39:07 +DATE: 2024-06-03_18:36:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.458071e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487880e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.459269e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.484247e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.486579e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.516495 sec +TOTAL : 0.512560 sec INFO: No Floating Point Exceptions have been reported - 2,100,340,953 cycles # 2.817 GHz - 3,318,240,864 instructions # 1.58 insn per cycle - 0.807243807 seconds time elapsed + 2,110,715,830 cycles # 2.825 GHz + 3,341,378,210 instructions # 1.58 insn per cycle + 0.806813764 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 4.151628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181415e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182666e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.128324e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.158214e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.126580 sec +TOTAL : 3.135265 sec INFO: No Floating Point Exceptions have been reported - 9,651,663,769 cycles # 2.855 GHz - 21,411,369,286 instructions # 2.22 insn per cycle - 3.436400390 seconds time elapsed + 9,696,524,262 cycles # 2.859 GHz + 21,940,401,561 instructions # 2.26 insn per cycle + 3.448862398 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838040e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.838971e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.838971e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833217e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833217e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.934375 sec +TOTAL : 8.961711 sec INFO: No Floating Point Exceptions have been reported - 25,608,373,833 cycles # 2.865 GHz - 78,937,451,677 instructions # 3.08 insn per cycle - 8.938566207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,704,324,268 cycles # 2.867 GHz + 78,963,978,235 instructions # 3.07 insn per cycle + 8.965873340 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.525117e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.528335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.528335e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.407652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410676e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.663427 sec +TOTAL : 4.823773 sec INFO: No Floating Point Exceptions have been reported - 12,892,979,270 cycles # 2.763 GHz - 39,279,722,056 instructions # 3.05 insn per cycle - 4.667648484 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,137,198,616 cycles # 2.722 GHz + 39,569,230,845 instructions # 3.01 insn per cycle + 4.827968795 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.833327e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.849001e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.849001e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.808509e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.824569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.824569e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.104949 sec +TOTAL : 2.111587 sec INFO: No Floating Point Exceptions have been reported - 5,585,628,818 cycles # 2.649 GHz - 13,686,707,794 instructions # 2.45 insn per cycle - 2.109194961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,632,091,917 cycles # 2.663 GHz + 13,830,351,821 instructions # 2.46 insn per cycle + 2.115903971 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.959882e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.980659e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.980659e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.875781e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.896132e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.896132e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.842075 sec +TOTAL : 1.859252 sec INFO: No Floating Point Exceptions have been reported - 4,892,059,435 cycles # 2.651 GHz - 12,339,041,510 instructions # 2.52 insn per cycle - 1.846337430 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,941,133,634 cycles # 2.653 GHz + 12,511,050,210 instructions # 2.53 insn per cycle + 1.863775304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.749292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.761080e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.761080e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.689168e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.701153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.701153e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.441489 sec +TOTAL : 2.463375 sec INFO: No Floating Point Exceptions have been reported - 4,113,508,290 cycles # 1.683 GHz - 6,332,907,864 instructions # 1.54 insn per cycle - 2.445760630 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,157,918,797 cycles # 1.686 GHz + 6,398,953,341 instructions # 1.54 insn per cycle + 2.467565422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 5745d06e17..1adc1b429a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:36:19 +DATE: 2024-06-03_18:33:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.461663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.488298e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490692e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.490991e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.516812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519478e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.512096 sec +TOTAL : 0.511721 sec INFO: No Floating Point Exceptions have been reported - 2,118,130,475 cycles # 2.818 GHz - 3,288,416,689 instructions # 1.55 insn per cycle - 0.809966158 seconds time elapsed + 2,115,984,574 cycles # 2.820 GHz + 3,347,070,767 instructions # 1.58 insn per cycle + 0.810000053 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 4.134536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164023e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165296e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.120312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.148695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.149900e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.077228 sec +TOTAL : 3.081960 sec INFO: No Floating Point Exceptions have been reported - 9,556,153,983 cycles # 2.858 GHz - 21,726,674,576 instructions # 2.27 insn per cycle - 3.399985544 seconds time elapsed + 9,549,075,862 cycles # 2.861 GHz + 21,937,059,442 instructions # 2.30 insn per cycle + 3.392837005 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839050e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839050e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839428e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.840299e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840299e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.931733 sec +TOTAL : 8.925724 sec INFO: No Floating Point Exceptions have been reported - 25,607,680,070 cycles # 2.866 GHz - 78,937,604,302 instructions # 3.08 insn per cycle - 8.935953129 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,698,291,007 cycles # 2.878 GHz + 78,968,762,593 instructions # 3.07 insn per cycle + 8.929919118 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.531271e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534495e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534495e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.420392e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.423509e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.423509e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.653724 sec +TOTAL : 4.804169 sec INFO: No Floating Point Exceptions have been reported - 12,891,706,587 cycles # 2.769 GHz - 39,279,955,585 instructions # 3.05 insn per cycle - 4.658078583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,120,732,392 cycles # 2.730 GHz + 39,569,936,739 instructions # 3.02 insn per cycle + 4.808564438 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.750526e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.765709e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.765709e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.723476e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.738556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.738556e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.125753 sec +TOTAL : 2.133009 sec INFO: No Floating Point Exceptions have been reported - 5,636,716,298 cycles # 2.647 GHz - 13,685,667,157 instructions # 2.43 insn per cycle - 2.130052622 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,628,049,025 cycles # 2.634 GHz + 13,831,258,234 instructions # 2.46 insn per cycle + 2.137317610 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.975427e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.996737e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.996737e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.911711e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.931877e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.931877e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.836722 sec +TOTAL : 1.849874 sec INFO: No Floating Point Exceptions have been reported - 4,887,653,189 cycles # 2.656 GHz - 12,340,725,033 instructions # 2.52 insn per cycle - 1.841006928 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,937,906,884 cycles # 2.664 GHz + 12,512,636,347 instructions # 2.53 insn per cycle + 1.854242976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.741757e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.753368e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.753368e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.695018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.706430e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.706430e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.442506 sec +TOTAL : 2.459636 sec INFO: No Floating Point Exceptions have been reported - 4,119,197,476 cycles # 1.684 GHz - 6,334,707,467 instructions # 1.54 insn per cycle - 2.446806756 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,146,970,086 cycles # 1.684 GHz + 6,398,911,395 instructions # 1.54 insn per cycle + 2.463931312 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 845fe92d47..be059bacfa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:33:36 +DATE: 2024-06-03_18:30:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.175037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.487201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490172e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.179061e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.492181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494484e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519308 sec +TOTAL : 0.516700 sec INFO: No Floating Point Exceptions have been reported - 2,132,091,270 cycles # 2.847 GHz - 3,400,172,905 instructions # 1.59 insn per cycle - 0.809359327 seconds time elapsed + 2,106,227,613 cycles # 2.820 GHz + 3,284,065,817 instructions # 1.56 insn per cycle + 0.806223623 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -70,15 +70,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.733086e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.182437e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.183712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.730205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178090e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.211390 sec +TOTAL : 3.209108 sec INFO: No Floating Point Exceptions have been reported - 9,891,188,972 cycles # 2.857 GHz - 21,285,655,080 instructions # 2.15 insn per cycle - 3.520480517 seconds time elapsed + 9,879,829,969 cycles # 2.855 GHz + 22,117,916,176 instructions # 2.24 insn per cycle + 3.518614714 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -99,16 +99,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838729e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839597e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839597e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.833069e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833958e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833958e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.929008 sec +TOTAL : 8.957266 sec INFO: No Floating Point Exceptions have been reported - 25,610,442,672 cycles # 2.867 GHz - 78,938,090,928 instructions # 3.08 insn per cycle - 8.933282594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,705,366,952 cycles # 2.869 GHz + 78,968,969,046 instructions # 3.07 insn per cycle + 8.962060815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -127,16 +127,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.506228e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509410e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.509410e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.422242e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.425294e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.425294e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.687492 sec +TOTAL : 4.801839 sec INFO: No Floating Point Exceptions have been reported - 12,899,492,836 cycles # 2.750 GHz - 39,283,102,027 instructions # 3.05 insn per cycle - 4.691777787 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,111,523,729 cycles # 2.729 GHz + 39,567,220,863 instructions # 3.02 insn per cycle + 4.806097144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -155,16 +155,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.752261e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.768022e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.768022e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.775140e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.790396e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.790396e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.125292 sec +TOTAL : 2.119070 sec INFO: No Floating Point Exceptions have been reported - 5,579,027,054 cycles # 2.621 GHz - 13,686,176,373 instructions # 2.45 insn per cycle - 2.129518303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,626,691,162 cycles # 2.651 GHz + 13,831,085,723 instructions # 2.46 insn per cycle + 2.123497883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -183,16 +183,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.957937e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.978146e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.978146e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.910298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.931005e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.931005e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.840124 sec +TOTAL : 1.850169 sec INFO: No Floating Point Exceptions have been reported - 4,887,669,768 cycles # 2.651 GHz - 12,340,977,183 instructions # 2.52 insn per cycle - 1.844385583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,938,091,387 cycles # 2.664 GHz + 12,512,642,240 instructions # 2.53 insn per cycle + 1.854518481 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -211,16 +211,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.740559e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.752226e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.752226e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.690516e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.701832e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.701832e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.443234 sec +TOTAL : 2.461169 sec INFO: No Floating Point Exceptions have been reported - 4,110,489,538 cycles # 1.680 GHz - 6,334,661,219 instructions # 1.54 insn per cycle - 2.447553100 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,150,873,146 cycles # 1.684 GHz + 6,398,872,000 instructions # 1.54 insn per cycle + 2.465432241 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index f7617fa14d..2910777a3a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:56:54 +DATE: 2024-06-03_18:03:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.465250e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494561e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460518e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.493733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496369e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.531384 sec +TOTAL : 0.528534 sec INFO: No Floating Point Exceptions have been reported - 2,176,903,857 cycles # 2.812 GHz - 3,360,015,570 instructions # 1.54 insn per cycle - 0.832792471 seconds time elapsed + 2,182,805,282 cycles # 2.827 GHz + 3,340,889,449 instructions # 1.53 insn per cycle + 0.831226796 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
4.144291e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.142925e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.177839e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.021369 sec +TOTAL : 3.031527 sec INFO: No Floating Point Exceptions have been reported - 9,368,773,778 cycles # 2.857 GHz - 21,286,316,558 instructions # 2.27 insn per cycle - 3.335014178 seconds time elapsed + 9,413,255,455 cycles # 2.860 GHz + 21,563,433,975 instructions # 2.29 insn per cycle + 3.350618061 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.844326e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.845217e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.845217e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839296e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.840216e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840216e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.901599 sec +TOTAL : 8.925746 sec INFO: No Floating Point Exceptions have been reported - 25,466,417,477 cycles # 2.860 GHz - 78,709,901,314 instructions # 3.09 insn per cycle - 8.905880803 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) + 25,644,420,769 cycles # 2.872 GHz + 78,708,718,415 instructions # 3.07 insn per cycle + 8.929928283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
4205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.439967e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.443121e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443121e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453191e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.456333e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456333e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.776530 sec +TOTAL : 4.758278 sec INFO: No Floating Point Exceptions have been reported - 12,973,172,137 cycles # 2.714 GHz - 39,229,674,228 instructions # 3.02 insn per cycle - 4.780939174 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) + 13,060,724,173 cycles # 2.743 GHz + 39,458,534,044 instructions # 3.02 insn per cycle + 4.762512071 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12985) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.803639e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.819236e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.819236e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.693265e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.707995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.707995e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.110784 sec +TOTAL : 2.141153 sec INFO: No Floating Point Exceptions have been reported - 5,623,478,205 cycles # 2.660 GHz - 13,801,627,183 instructions # 2.45 insn per cycle - 2.115128076 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) + 5,709,775,869 cycles # 2.662 GHz + 13,917,563,949 instructions # 2.44 insn per cycle + 2.145429673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11610) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.797294e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.817444e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.817444e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.788217e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.807623e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.807623e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.873371 sec +TOTAL : 1.875389 sec INFO: No Floating Point Exceptions have been reported - 4,983,137,076 cycles # 2.655 GHz - 12,465,949,717 instructions # 2.50 insn per cycle - 1.877612639 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) + 4,999,929,304 cycles # 2.661 GHz + 12,609,755,396 instructions # 2.52 insn per cycle + 1.879646139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10457) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.717520e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.728878e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.728878e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.675653e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.686951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.451074 sec +TOTAL : 2.466427 sec INFO: No Floating Point Exceptions have been reported - 4,120,290,741 cycles # 1.679 GHz - 6,458,681,411 instructions # 1.57 insn per cycle - 2.455362823 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) + 4,153,899,937 cycles # 1.682 GHz + 6,506,899,667 instructions # 1.57 insn per cycle + 2.470734551 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1774) (512y: 194) (512z: 9387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 0fe5c16438..9680f74ecf 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:19:48 +DATE: 2024-06-03_18:16:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.246479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.270244e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.272477e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266293e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.268380e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.541806 sec +TOTAL : 0.540877 sec INFO: No Floating Point Exceptions have been reported - 2,171,468,423 cycles # 2.820 GHz - 3,387,210,535 instructions # 1.56 insn per cycle - 0.829471359 seconds time elapsed + 2,168,292,720 cycles # 2.822 GHz + 3,388,946,935 instructions # 1.56 insn per cycle + 0.827313099 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
3.755733e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.780823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.781841e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.760235e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.787497e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.311285 sec +TOTAL : 3.314609 sec INFO: No Floating Point Exceptions have been reported - 10,187,766,661 cycles # 2.856 GHz - 22,069,205,702 instructions # 2.17 insn per cycle - 3.622898713 seconds time elapsed + 10,192,422,594 cycles # 2.858 GHz + 22,975,782,663 instructions # 2.25 insn per cycle + 3.624626973 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.133683e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.134122e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.134122e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.102297e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.102729e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.102729e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.685051 sec +TOTAL : 39.987442 sec INFO: No Floating Point Exceptions have been reported - 113,512,735,354 cycles # 2.860 GHz - 144,824,168,290 instructions # 1.28 insn per cycle - 39.689377450 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21353) (avx2: 0) (512y: 0) (512z: 0) + 113,516,605,337 cycles # 2.839 GHz + 144,863,586,012 instructions # 1.28 insn per cycle + 39.991768743 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4:21407) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.009048e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.011479e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.011479e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.998649e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.000991e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000991e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.460357 sec +TOTAL : 5.479213 sec INFO: No Floating Point Exceptions have been reported - 14,780,198,562 cycles # 2.706 GHz - 37,576,710,982 instructions # 2.54 insn per cycle - 5.464730306 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) + 14,791,666,519 cycles # 2.698 GHz + 37,656,320,268 instructions # 2.55 insn per cycle + 5.483581524 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.168780e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.181780e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.181780e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.130527e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.143433e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143433e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.297592 sec +TOTAL : 2.309741 sec INFO: No Floating Point Exceptions have been reported - 6,127,083,636 cycles # 2.663 GHz - 13,063,845,546 instructions # 2.13 insn per cycle - 2.302025246 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) + 6,126,510,346 cycles # 2.648 GHz + 13,068,223,479 instructions # 2.13 insn per cycle + 2.314176862 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46983) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.695008e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.714356e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.714356e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.503559e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.521907e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.521907e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.895827 sec +TOTAL : 1.938323 sec INFO: No Floating Point Exceptions have been reported - 5,063,974,681 cycles # 2.666 GHz - 11,441,302,228 instructions # 2.26 insn per cycle - 1.900215464 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) + 5,136,068,700 cycles # 2.645 GHz + 11,461,772,305 instructions # 2.23 insn per cycle + 1.942656144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40514) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.958952e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.971780e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.971780e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.006827e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.019287e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.019287e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.366561 sec +TOTAL : 2.350376 sec INFO: No Floating Point Exceptions have been reported - 3,976,192,244 cycles # 1.678 GHz - 5,945,001,398 instructions # 1.50 insn per cycle - 2.371015031 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) + 3,963,784,764 cycles # 1.684 GHz + 5,935,907,800 instructions # 1.50 insn per cycle + 2.354741258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2456) (512y: 337) (512z:39348) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index eab4a6ad11..6eb1688819 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:20:56 +DATE: 2024-06-03_18:18:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.265532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.290379e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.292594e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.268679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.289756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.291684e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539246 sec +TOTAL : 0.538173 sec INFO: No Floating Point Exceptions have been reported - 2,164,648,283 cycles # 2.819 GHz - 3,395,016,138 instructions # 1.57 insn per cycle - 0.825962894 seconds time elapsed + 2,184,059,608 cycles # 2.818 GHz + 3,376,427,953 instructions # 1.55 insn per cycle + 0.832264536 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
3.767479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.792561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.793589e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.761939e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.788694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.789907e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.296186 sec +TOTAL : 3.298105 sec INFO: No Floating Point Exceptions have been reported - 10,182,455,226 cycles # 2.857 GHz - 22,751,947,545 instructions # 2.23 insn per cycle - 3.619847665 seconds time elapsed + 10,257,333,842 cycles # 2.884 GHz + 22,698,193,148 instructions # 2.21 insn per cycle + 3.611392054 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.098681e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.099131e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099131e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.142882e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.143319e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.143319e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 40.023617 sec +TOTAL : 39.595989 sec INFO: No Floating Point Exceptions have been reported - 114,408,903,354 cycles # 2.859 GHz - 144,789,258,871 instructions # 1.27 insn per cycle - 40.028023083 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20719) (avx2: 0) (512y: 0) (512z: 0) + 113,429,020,583 cycles # 2.865 GHz + 144,293,514,459 instructions # 1.27 insn per cycle + 39.600246121 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4:21037) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.944993e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.947319e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.947319e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.929742e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.931957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.931957e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.578054 sec +TOTAL : 5.607131 sec INFO: No Floating Point Exceptions have been reported - 15,223,576,233 cycles # 2.728 GHz - 37,762,970,352 instructions # 2.48 insn per cycle - 5.582406080 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) + 15,313,512,776 cycles # 2.730 GHz + 38,398,326,248 instructions # 2.51 insn per cycle + 5.611482587 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.278833e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.292510e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.292510e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.272721e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.286026e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.286026e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.262588 sec +TOTAL : 2.265052 sec INFO: No Floating Point Exceptions have been reported - 6,007,020,457 cycles # 2.651 GHz - 12,896,115,872 instructions # 2.15 insn per cycle - 2.266904685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) + 6,015,873,656 cycles # 2.652 GHz + 12,943,956,040 instructions # 2.15 insn per cycle + 2.269278552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.679346e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.698475e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.698475e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.622078e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641542e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.899294 sec +TOTAL : 1.911674 sec INFO: No Floating Point Exceptions have been reported - 5,094,216,811 cycles # 2.677 GHz - 11,448,333,625 instructions # 2.25 insn per cycle - 1.903608667 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) + 5,095,328,711 cycles # 2.660 GHz + 11,457,094,811 instructions # 2.25 insn per cycle + 1.916057589 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40158) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.004138e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.016815e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.016815e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.994860e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.351445 sec +TOTAL : 2.354136 sec INFO: No Floating Point Exceptions have been reported - 3,952,954,226 cycles # 1.679 GHz - 5,896,992,592 instructions # 1.49 insn per cycle - 2.356043721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) + 3,958,563,795 cycles # 1.679 GHz + 5,898,244,361 instructions # 1.49 insn per cycle + 2.358438675 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38926) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index fac6650d6a..bde0297986 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:57:27 +DATE: 2024-06-03_18:04:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.356553e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.410019e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.415334e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.993391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.048362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.053874e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487456 sec +TOTAL : 0.488917 sec INFO: No Floating Point Exceptions have been reported - 1,983,251,875 cycles # 2.820 GHz - 2,928,209,714 instructions # 1.48 insn per cycle - 0.764609113 seconds time elapsed + 1,999,590,506 cycles # 2.833 GHz + 2,960,294,533 instructions # 1.48 insn per cycle + 0.765856045 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
8.608392e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.688100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.691532e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.124915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.193549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.196589e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.725977 sec +TOTAL : 1.798975 sec INFO: No Floating Point Exceptions have been reported - 5,578,929,082 cycles # 2.849 GHz - 11,014,391,874 instructions # 1.97 insn per cycle - 2.015111587 seconds time elapsed + 5,823,539,302 cycles # 2.863 GHz + 12,401,394,438 instructions # 2.13 insn per cycle + 2.090093066 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.908177e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909097e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.845616e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.846469e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.846469e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.603316 sec +TOTAL : 8.894726 sec INFO: No Floating Point Exceptions have been reported - 24,630,404,396 cycles # 2.862 GHz - 78,128,784,942 instructions # 3.17 insn per cycle - 8.607433386 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,938,295,137 cycles # 2.803 GHz + 79,123,203,849 instructions # 3.17 insn per cycle + 8.898862244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.834941e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.847074e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.847074e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.866908e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879125e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879125e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.408187 sec +TOTAL : 2.396402 sec INFO: No Floating Point Exceptions have been reported - 6,475,075,186 cycles # 2.685 GHz - 20,120,578,414 instructions # 3.11 insn per cycle - 2.412367232 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,527,381,936 cycles # 2.720 GHz + 20,278,170,595 instructions # 3.11 insn per cycle + 2.400661491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.547510e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.553645e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.553645e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.565130e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.571406e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.571406e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.067919 sec +TOTAL : 1.055961 sec INFO: No Floating Point Exceptions have been reported - 2,818,351,962 cycles # 2.631 GHz - 6,988,245,481 instructions # 2.48 insn per cycle - 1.072071344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) 
(512y: 0) (512z: 0) + 2,869,911,542 cycles # 2.709 GHz + 7,073,058,843 instructions # 2.46 insn per cycle + 1.060231350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.757892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.765838e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.765838e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.761782e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.769925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.769925e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.941407 sec +TOTAL : 0.939080 sec INFO: No 
Floating Point Exceptions have been reported - 2,493,554,219 cycles # 2.639 GHz - 6,295,971,949 instructions # 2.52 insn per cycle - 0.945627547 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,557,200,794 cycles # 2.713 GHz + 6,411,348,860 instructions # 2.51 insn per cycle + 0.943228450 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.363379e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.368145e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.368145e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391062e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.395939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395939e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.211254 sec +TOTAL : 1.187645 sec INFO: No Floating Point Exceptions have been reported - 2,046,343,979 cycles # 1.685 GHz - 3,265,913,971 instructions # 1.60 insn per cycle - 1.215412846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,079,255,997 cycles # 1.746 GHz + 3,311,166,731 instructions # 1.59 insn per cycle + 1.191833308 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index bcf7be18e6..e9aab55893 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: 
Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:29:50 +DATE: 2024-06-03_18:27:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.598474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.301712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.301712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.298686e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.931151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.931151e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.472439 sec +TOTAL : 0.480027 sec INFO: No Floating Point Exceptions have been reported - 1,958,040,860 cycles # 2.820 GHz - 2,939,762,551 instructions # 1.50 insn per cycle - 0.750896192 seconds time elapsed + 1,949,076,607 cycles # 2.814 GHz + 2,936,227,158 instructions # 1.51 insn per cycle + 0.751094441 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.273826e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.571327e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.571327e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.901552 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.950050e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.119314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.119314e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 +TOTAL : 1.970885 sec INFO: No Floating Point Exceptions have been reported - 6,095,137,537 cycles # 2.849 GHz - 12,940,780,690 instructions # 2.12 insn per cycle - 2.195795529 seconds time elapsed + 6,318,815,618 cycles # 2.853 GHz + 13,486,857,827 instructions # 2.13 insn per cycle + 2.273531724 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -95,8 +95,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.910575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.911506e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.911506e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921328e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.922325e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922325e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.594991 sec +TOTAL : 8.547068 sec INFO: No Floating Point Exceptions have been reported - 24,652,141,911 cycles # 2.867 GHz - 78,137,160,167 instructions # 3.17 insn per cycle - 8.599344489 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,942,697,613 cycles # 2.917 GHz + 79,121,197,548 instructions # 3.17 insn per cycle + 8.551612978 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.898082e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.911105e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.911105e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.885373e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.897651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.388247 sec +TOTAL : 2.392647 sec INFO: No Floating Point Exceptions have been reported - 6,478,062,029 cycles # 2.708 GHz - 20,129,777,692 instructions # 3.11 insn per cycle - 2.392637226 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) 
(avx2: 0) (512y: 0) (512z: 0) + 6,537,624,163 cycles # 2.728 GHz + 20,287,271,158 instructions # 3.10 insn per cycle + 2.396994035 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.558890e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565336e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565336e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.525564e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.531703e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531703e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.063397 sec +TOTAL : 1.086336 
sec INFO: No Floating Point Exceptions have been reported - 2,829,584,006 cycles # 2.652 GHz - 6,998,429,462 instructions # 2.47 insn per cycle - 1.067880984 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,877,297,833 cycles # 2.640 GHz + 7,083,001,521 instructions # 2.46 insn per cycle + 1.090815935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.773701e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.782227e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.782227e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.729035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.737098e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.737098e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.935665 sec +TOTAL : 0.959489 sec INFO: No Floating Point Exceptions have been reported - 2,499,925,740 cycles # 2.661 GHz - 6,304,962,307 instructions # 2.52 insn per cycle - 0.940287567 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,542,436,755 cycles # 2.639 GHz + 6,420,635,281 instructions # 2.53 insn per cycle + 0.963863512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.365132e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.370126e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.370126e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342357e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.347052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.347052e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.212746 sec +TOTAL : 1.233007 sec INFO: No Floating Point Exceptions have been reported - 2,057,028,781 cycles # 1.691 GHz - 3,276,379,459 instructions # 1.59 insn per cycle - 1.217212255 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,091,037,827 cycles # 1.691 GHz + 3,321,502,227 instructions # 1.59 insn per cycle + 1.237437594 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index b890671a07..dc0cb0757b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:39:40 +DATE: 2024-06-03_18:37:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.323640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.375162e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.380930e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.470190 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.987947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.031953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.037166e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 +TOTAL : 0.476174 sec INFO: No Floating Point Exceptions have been reported - 1,954,941,965 cycles # 2.821 GHz - 2,911,153,946 instructions # 1.49 insn per cycle - 0.750497586 seconds time elapsed + 1,936,582,609 cycles # 2.812 GHz + 2,924,836,850 instructions # 1.51 insn per cycle + 0.746067574 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP 
precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.584082e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.652641e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.655733e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.198293e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.256948e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259713e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.807227 sec +TOTAL : 1.871961 sec INFO: No Floating Point Exceptions have been reported - 5,802,713,802 cycles # 2.849 GHz - 11,535,404,543 instructions # 1.99 insn per cycle - 2.092606216 seconds time elapsed + 5,971,916,027 cycles # 2.846 GHz + 12,313,771,451 instructions # 2.06 insn per cycle + 2.156592917 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.913976e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914935e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914935e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.890245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.891148e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891148e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.577750 sec +TOTAL : 8.685533 sec INFO: No Floating Point Exceptions have been reported - 24,611,965,012 cycles # 2.868 GHz - 78,127,241,298 instructions # 3.17 insn per cycle - 8.581839983 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,908,953,921 cycles # 2.867 GHz + 79,116,868,705 instructions # 3.18 insn per cycle + 8.689669358 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.890077e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.902996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.902996e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.776340e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.788373e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.788373e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.389920 sec +TOTAL : 2.429835 sec INFO: No Floating Point Exceptions have been reported - 6,479,004,824 cycles # 2.707 GHz - 20,120,753,195 instructions # 3.11 insn per cycle - 2.394143360 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,530,091,398 cycles # 2.684 GHz + 20,280,205,711 instructions # 3.11 insn per cycle + 2.433950240 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.544733e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.550971e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.550971e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.522176e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528327e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528327e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.071318 sec +TOTAL : 1.086681 sec INFO: No Floating Point Exceptions have been reported - 2,822,669,649 cycles # 2.626 GHz - 6,987,403,130 instructions # 2.48 insn per cycle - 1.075405956 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2:11874) (512y: 0) (512z: 0) + 2,870,476,193 cycles # 2.633 GHz + 7,070,613,855 instructions # 2.46 insn per cycle + 1.090889376 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.762670e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.770899e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.770899e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.741342e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.749550e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.749550e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.940633 sec +TOTAL : 
0.951048 sec INFO: No Floating Point Exceptions have been reported - 2,495,155,242 cycles # 2.643 GHz - 6,294,152,477 instructions # 2.52 insn per cycle - 0.944657528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,535,390,814 cycles # 2.657 GHz + 6,407,801,612 instructions # 2.53 insn per cycle + 0.955161053 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.367006e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.372042e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.372042e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.340513e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.345204e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.345204e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.209697 sec +TOTAL : 1.233803 sec INFO: No Floating Point Exceptions have been reported - 2,049,421,235 cycles # 1.690 GHz - 3,264,511,946 instructions # 1.59 insn per cycle - 1.213845698 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,084,630,811 cycles # 1.685 GHz + 3,309,613,890 instructions # 1.59 insn per cycle + 1.238007402 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index d9b7ee3321..dd57b89c3f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -40,7 
+40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:36:52 +DATE: 2024-06-03_18:34:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.354433e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.405954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.411991e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.977039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.020614e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.025828e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.469295 sec +TOTAL : 0.474208 sec INFO: No Floating Point Exceptions have been reported - 1,931,447,531 cycles # 2.818 GHz - 2,857,411,874 instructions # 1.48 insn per cycle - 0.742623214 seconds time elapsed + 1,932,320,281 cycles # 2.821 GHz + 2,899,665,099 instructions # 1.50 insn per cycle + 0.743582642 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] 
Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.583355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.650562e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.653652e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.198516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257721e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.260421e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.754167 sec +TOTAL : 1.817458 sec INFO: No Floating Point Exceptions have been reported - 5,647,206,768 cycles # 2.850 GHz - 12,402,873,577 instructions # 2.20 insn per cycle - 2.038447332 seconds time elapsed + 5,848,497,460 cycles # 2.852 GHz + 12,362,000,534 instructions # 2.11 insn per cycle + 2.107105881 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.908988e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909932e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909932e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.892402e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893307e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893307e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.599963 sec +TOTAL : 8.674955 sec INFO: No Floating Point Exceptions have been reported - 24,610,286,619 cycles # 2.861 GHz - 78,133,539,217 instructions # 3.17 insn per cycle - 8.604049688 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,880,877,717 cycles # 2.867 GHz + 79,117,977,201 instructions # 3.18 insn per cycle + 8.679047102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.879217e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.891579e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.891579e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.793441e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.805458e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.805458e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.392369 sec +TOTAL : 2.422124 sec INFO: No Floating Point Exceptions have been reported - 6,476,796,255 cycles # 2.704 GHz - 20,121,504,943 instructions # 3.11 insn per cycle - 2.396533403 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) 
(avx2: 0) (512y: 0) (512z: 0) + 6,538,684,193 cycles # 2.696 GHz + 20,279,294,211 instructions # 3.10 insn per cycle + 2.426280911 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.559523e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565982e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565982e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.535010e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.541066e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.541066e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.059899 sec +TOTAL : 1.076749 
sec INFO: No Floating Point Exceptions have been reported - 2,823,568,100 cycles # 2.655 GHz - 6,988,803,220 instructions # 2.48 insn per cycle - 1.064076387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,872,784,220 cycles # 2.660 GHz + 7,073,630,125 instructions # 2.46 insn per cycle + 1.080883194 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.763562e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.771848e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.771848e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.747305e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.755134e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.755134e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.938280 sec +TOTAL : 0.946884 sec INFO: No Floating Point Exceptions have been reported - 2,491,711,137 cycles # 2.646 GHz - 6,295,398,273 instructions # 2.53 insn per cycle - 0.942474198 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,530,495,182 cycles # 2.662 GHz + 6,410,891,269 instructions # 2.53 insn per cycle + 0.951073221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.365021e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.369781e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.369781e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.350589e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.355391e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.355391e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.209851 sec +TOTAL : 1.222830 sec INFO: No Floating Point Exceptions have been reported - 2,048,944,436 cycles # 1.689 GHz - 3,266,101,120 instructions # 1.59 insn per cycle - 1.214099702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,079,597,483 cycles # 1.696 GHz + 3,311,319,686 instructions # 1.59 insn per cycle + 1.227064652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index ae89ba0a21..9ad1cc5540 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:34:09 +DATE: 2024-06-03_18:31:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.743567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.402148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.408206e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.421409e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.005306e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.010390e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.471325 sec +TOTAL : 0.478335 sec INFO: No Floating Point Exceptions have been reported - 1,966,995,104 cycles # 2.818 GHz - 2,861,570,291 instructions # 1.45 insn per cycle - 0.755170556 seconds time elapsed + 1,939,991,217 cycles # 2.815 GHz + 2,927,824,252 instructions # 1.51 insn per cycle + 0.747853851 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -70,15 +70,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.483014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.687507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.690776e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.838348 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.141822e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.222275e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.224942e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 +TOTAL : 1.901980 sec INFO: No Floating Point Exceptions have been reported - 5,879,672,141 cycles # 2.848 GHz - 11,762,772,012 instructions # 2.00 insn per cycle - 2.123420033 seconds time elapsed + 6,075,195,260 cycles # 2.850 GHz + 12,958,083,094 instructions # 2.13 insn per cycle + 2.187986875 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -86,8 +86,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -99,16 +99,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.911656e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912578e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.900045e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.900956e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.900956e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.587423 sec +TOTAL : 8.639797 sec INFO: No Floating Point Exceptions have been reported - 24,606,124,860 cycles # 2.865 GHz - 78,133,915,634 instructions # 3.18 insn per cycle - 8.591534118 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,862,319,107 cycles # 2.877 GHz + 79,117,882,595 instructions # 3.18 insn per cycle + 8.644019221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -116,8 +116,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -127,16 +127,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.884793e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897138e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.897138e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.841326e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.853653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.853653e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.390128 sec +TOTAL : 2.405532 sec INFO: No Floating Point Exceptions have been reported - 6,472,493,425 cycles # 2.704 GHz - 20,120,111,462 instructions # 3.11 insn per cycle - 2.394291914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) 
(avx2: 0) (512y: 0) (512z: 0) + 6,525,406,357 cycles # 2.709 GHz + 20,278,336,627 instructions # 3.11 insn per cycle + 2.409634043 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -144,8 +144,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -155,16 +155,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.539420e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.545531e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.545531e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.529972e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535987e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.535987e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.073549 sec +TOTAL : 1.080386 
sec INFO: No Floating Point Exceptions have been reported - 2,823,771,290 cycles # 2.622 GHz - 6,988,702,898 instructions # 2.47 insn per cycle - 1.077710790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,870,228,383 cycles # 2.648 GHz + 7,072,925,245 instructions # 2.46 insn per cycle + 1.084779868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -172,8 +172,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -183,16 +183,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.753139e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.761048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.761048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
1.740540e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.748411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.748411e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.944047 sec +TOTAL : 0.950693 sec INFO: No Floating Point Exceptions have been reported - 2,494,675,517 cycles # 2.633 GHz - 6,296,230,342 instructions # 2.52 insn per cycle - 0.948274903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,529,265,476 cycles # 2.651 GHz + 6,410,644,765 instructions # 2.53 insn per cycle + 0.954844718 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -200,8 +200,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -211,16 +211,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.362634e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.367367e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367367e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342143e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.346738e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.346738e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.211984 sec +TOTAL : 1.230394 sec INFO: No Floating Point Exceptions have been reported - 2,049,642,332 cycles # 1.687 GHz - 3,266,281,603 instructions # 1.59 insn per cycle - 1.216098332 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,079,600,495 cycles # 1.686 GHz + 3,311,340,203 instructions # 1.59 insn per cycle + 1.234597523 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -228,8 +228,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 2894e34cf4..4b1379d26e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:57:53 +DATE: 2024-06-03_18:04:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.333854e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.386711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.394716e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.944277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.998202e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.004125e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.491038 sec +TOTAL : 0.491300 sec INFO: No Floating Point Exceptions have been reported - 1,996,644,525 cycles # 2.816 GHz - 2,959,911,635 instructions # 1.48 insn per cycle - 0.767468034 seconds time elapsed + 2,027,457,518 cycles # 2.855 GHz + 3,013,938,753 instructions # 1.49 insn per cycle + 0.770346504 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
8.597534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.679033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.682352e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.155146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.224565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.227706e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.726608 sec +TOTAL : 1.795306 sec INFO: No Floating Point Exceptions have been reported - 5,570,117,632 cycles # 2.845 GHz - 11,507,962,990 instructions # 2.07 insn per cycle - 2.016820610 seconds time elapsed + 5,836,009,627 cycles # 2.860 GHz + 11,928,682,704 instructions # 2.04 insn per cycle + 2.097501087 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 
1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.915894e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916848e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916848e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.894741e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.895642e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.895642e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.568155 sec +TOTAL : 8.664226 sec INFO: No Floating Point Exceptions have been reported - 24,541,635,551 cycles # 2.863 GHz - 77,860,582,476 instructions # 3.17 insn per cycle - 8.572306446 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) + 24,819,484,072 cycles # 2.864 GHz + 78,852,465,652 instructions # 3.18 insn per cycle + 8.668329453 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3106) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866268634797E-004 -Relative difference = 5.630135835748959e-08 +Avg ME (F77/C++) = 6.6274866250177339E-004 +Relative difference = 5.65798569465384e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.989453e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.002228e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.002228e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.915710e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.928145e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.928145e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.354308 sec +TOTAL : 2.379658 sec INFO: No Floating Point Exceptions have been reported - 6,429,555,449 cycles # 2.727 GHz - 20,085,437,100 instructions # 3.12 insn per cycle - 2.358577961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) + 6,479,109,742 cycles # 2.719 GHz + 20,237,907,077 instructions # 3.12 insn per cycle + 2.383835406 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13507) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861465384638E-004 -Relative difference = 2.211071647257023e-08 +Avg ME (F77/C++) = 6.6274861448331612E-004 +Relative difference = 2.1853408865157068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.495665e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.501413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.501413e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.473012e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.478589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.478589e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.104553 sec +TOTAL : 1.121480 sec INFO: No Floating Point Exceptions have been reported - 2,915,650,989 cycles # 2.631 GHz - 7,129,883,095 instructions # 2.45 insn per cycle - 1.108735215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12261) 
(512y: 0) (512z: 0) + 2,980,057,411 cycles # 2.650 GHz + 7,214,022,820 instructions # 2.42 insn per cycle + 1.125580830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12458) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +Avg ME (F77/C++) = 6.6271939668088170E-004 +Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.680175e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.687445e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.687445e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.689100e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.696678e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.696678e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.984092 sec +TOTAL : 0.978913 sec INFO: No Floating 
Point Exceptions have been reported - 2,594,901,147 cycles # 2.627 GHz - 6,438,491,817 instructions # 2.48 insn per cycle - 0.988307717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) + 2,616,026,965 cycles # 2.663 GHz + 6,551,988,105 instructions # 2.50 insn per cycle + 0.983164160 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11478) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +Avg ME (F77/C++) = 6.6271939668088170E-004 +Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.316604e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.321189e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321189e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.298030e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.302319e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.302319e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.253756 sec +TOTAL : 1.271823 sec INFO: No Floating Point Exceptions have been reported - 2,120,187,945 cycles # 1.687 GHz - 3,427,717,458 instructions # 1.62 insn per cycle - 1.258078621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2924) (512y: 22) (512z: 9654) + 2,150,298,528 cycles # 1.687 GHz + 3,469,612,781 instructions # 1.61 insn per cycle + 1.275947754 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3051) (512y: 25) (512z: 9681) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952032322112E-004 -Relative difference = 3.066639970473621e-08 +Avg ME (F77/C++) = 6.6271952032316561E-004 +Relative difference = 3.066631594207157e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index f9728316f5..9a7511ccb1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:22:03 +DATE: 2024-06-03_18:19:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.562593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.602761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.607258e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.496319 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.040325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.086216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.091080e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.493370 sec INFO: No Floating Point Exceptions have been reported - 2,073,990,194 cycles # 2.815 GHz - 3,053,942,926 instructions # 1.47 insn per cycle - 0.794515563 seconds time elapsed + 1,994,267,279 cycles # 2.817 GHz + 2,950,946,274 instructions # 1.48 insn per cycle + 0.766690512 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] 
[hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.711536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.769520e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.772095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.679705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.740696e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.743534e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.869015 sec +TOTAL : 1.728122 sec INFO: No Floating Point Exceptions have been reported - 6,036,319,674 cycles # 2.845 GHz - 12,339,909,244 instructions # 2.04 insn per cycle - 2.177861676 seconds time elapsed + 5,590,233,725 cycles # 2.853 GHz + 11,001,068,330 instructions # 1.97 insn per cycle + 2.016085871 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +Avg ME (F77/GPU) = 6.6262669162351490E-004 +Relative difference = 2.8232862531213374e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.442190e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.442945e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.442945e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.326582e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.327296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.327296e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 30.143467 sec +TOTAL : 30.797428 sec INFO: No Floating Point Exceptions have been reported - 86,238,493,092 cycles # 2.861 GHz - 135,582,429,521 instructions # 1.57 insn per cycle - 30.147691683 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15593) (avx2: 0) (512y: 0) (512z: 0) + 88,271,755,592 cycles # 2.866 GHz + 135,713,283,036 instructions # 1.54 insn per cycle + 30.801639573 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15654) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275351196781740E-004 -Relative difference = 1.805772034719401e-08 +Avg ME (F77/C++) = 6.6275351083142087E-004 +Relative difference = 1.6343060926412837e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.622664e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.634101e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.634101e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.650422e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.662101e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.662101e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.485418 sec +TOTAL : 2.475028 sec INFO: No Floating Point Exceptions have been reported - 6,780,106,144 cycles # 2.725 GHz - 19,386,070,044 instructions # 2.86 insn per cycle - 2.489679561 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69681) (avx2: 0) 
(512y: 0) (512z: 0) + 6,775,368,084 cycles # 2.734 GHz + 19,365,438,660 instructions # 2.86 insn per cycle + 2.479304920 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862707273868E-004 -Relative difference = 4.0849182767952624e-08 +Avg ME (F77/C++) = 6.6274862748188362E-004 +Relative difference = 4.14665283800746e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.375981e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.380844e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.380844e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.200183 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.379846e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.384758e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.384758e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 
2.367377e+00 ) GeV^-4 +TOTAL : 1.197144 sec INFO: No Floating Point Exceptions have been reported - 3,187,701,871 cycles # 2.648 GHz - 6,807,898,152 instructions # 2.14 insn per cycle - 1.204433728 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) + 3,171,868,252 cycles # 2.642 GHz + 6,800,239,710 instructions # 2.14 insn per cycle + 1.201396205 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49016) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +Avg ME (F77/C++) = 6.6272731568543797E-004 +Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667985e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.675164e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.675164e+04 ) sec^-1 
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.991406 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.672800e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.680200e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.680200e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 0.988606 sec INFO: No Floating Point Exceptions have been reported - 2,635,968,315 cycles # 2.649 GHz - 5,985,925,835 instructions # 2.27 insn per cycle - 0.995667999 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) + 2,634,024,890 cycles # 2.655 GHz + 5,977,125,707 instructions # 2.27 insn per cycle + 0.992732467 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42613) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +Avg ME (F77/C++) = 6.6272731568543797E-004 +Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.343303e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.347932e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.347932e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.229182 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.340599e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.345196e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.345196e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.232185 sec INFO: No Floating Point Exceptions have been reported - 2,077,787,049 cycles # 1.685 GHz - 3,500,922,258 instructions # 1.68 insn per cycle - 1.233472370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5210) (512y: 3) (512z:44829) + 2,080,640,847 cycles # 1.685 GHz + 3,501,935,156 instructions # 1.68 insn per cycle + 1.236390991 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 5210) (512y: 3) (512z:44834) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750363879224E-004 -Relative difference = 5.490631193034436e-09 +Avg ME (F77/C++) = 6.6272750237027223E-004 +Relative difference = 3.5765412974815996e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index a6e1efe771..b7d395e1d4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:22:53 +DATE: 2024-06-03_18:20:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.474201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.511992e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.516380e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.495445 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.115525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.158112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.163416e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.488524 sec INFO: No Floating Point Exceptions have been reported - 2,069,875,918 cycles # 2.815 GHz - 3,088,038,049 instructions # 1.49 insn per cycle - 0.793826258 seconds time elapsed + 1,995,967,010 cycles # 2.825 GHz + 2,980,745,793 instructions # 1.49 insn per cycle + 0.765930733 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision 
= FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.633274e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.688888e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.691415e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.783687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.846234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.849143e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.883548 sec +TOTAL : 1.714935 sec INFO: No Floating Point Exceptions have been reported - 6,034,409,459 cycles # 2.847 GHz - 12,602,199,510 instructions # 2.09 insn per cycle - 2.178280060 seconds time elapsed + 5,548,221,436 cycles # 2.845 GHz + 11,425,921,065 instructions # 2.06 insn per cycle + 2.006998021 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +Avg ME (F77/GPU) = 6.6262669162351490E-004 +Relative difference = 2.8232862531213374e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.435127e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.435900e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.435900e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.490308e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.491091e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.491091e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 30.183033 sec +TOTAL : 29.879379 sec INFO: No Floating Point Exceptions have been reported - 86,348,373,585 cycles # 2.861 GHz - 135,991,147,369 instructions # 1.57 insn per cycle - 30.187286457 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15571) (avx2: 0) (512y: 0) (512z: 0) + 85,803,548,967 cycles # 2.872 GHz + 135,586,619,473 instructions # 1.58 insn per cycle + 29.883595944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275348988418387E-004 -Relative difference = 1.5263316105958472e-08 +Avg ME (F77/C++) = 6.6275346699767868E-004 +Relative difference = 4.979577076821206e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.576661e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588266e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588266e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.541123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.552430e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.552430e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.501741 sec +TOTAL : 2.515548 sec INFO: No Floating Point Exceptions have been reported - 6,860,063,616 cycles # 2.739 GHz - 19,439,732,968 instructions # 2.83 insn per cycle - 2.505990169 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) + 6,837,292,011 cycles # 2.714 GHz + 19,414,639,291 instructions # 2.84 insn per cycle + 2.519892605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69633) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862764021530E-004 -Relative difference = 4.170542995014107e-08 +Avg ME (F77/C++) = 6.6274862799683282E-004 +Relative difference = 4.2243518621014775e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.407660e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.412779e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.412779e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.173483 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.406201e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.411445e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411445e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.174301 sec INFO: No Floating Point Exceptions have been reported - 3,113,664,715 cycles # 2.645 GHz - 6,718,777,649 instructions # 2.16 insn per cycle - 1.177759140 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) + 3,108,934,296 cycles # 2.640 GHz + 6,722,953,423 instructions # 2.16 insn per cycle + 1.178539821 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47703) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +Avg ME (F77/C++) = 6.6272731623419345E-004 +Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667132e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.674306e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.674306e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.991992 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.665805e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.673199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 1.673199e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 0.992561 sec INFO: No Floating Point Exceptions have been reported - 2,638,087,053 cycles # 2.650 GHz - 5,969,912,308 instructions # 2.26 insn per cycle - 0.996231534 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) + 2,633,671,965 cycles # 2.644 GHz + 5,976,623,304 instructions # 2.27 insn per cycle + 0.996855743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41894) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +Avg ME (F77/C++) = 6.6272731623419345E-004 +Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.342386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.347018e+04 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.347018e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.229963 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.344162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.348769e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.348769e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.228300 sec INFO: No Floating Point Exceptions have been reported - 2,078,559,427 cycles # 1.687 GHz - 3,494,531,487 instructions # 1.68 insn per cycle - 1.234278676 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4174) (512y: 4) (512z:44472) + 2,078,335,327 cycles # 1.687 GHz + 3,494,575,133 instructions # 1.68 insn per cycle + 1.232506015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4174) (512y: 4) (512z:44485) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750384530066E-004 -Relative difference = 5.80223501432476e-09 +Avg ME (F77/C++) = 6.6272750247886592E-004 +Relative difference = 3.740400032174438e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 7c14a2e7fb..ac0969612a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:58:18 +DATE: 2024-06-03_18:05:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.482581e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.486035e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.488786e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527538 sec +TOTAL : 0.536243 sec INFO: No Floating Point Exceptions have been reported - 2,176,696,799 cycles # 2.823 GHz - 3,403,965,501 instructions # 1.56 insn per cycle - 0.831403987 seconds time elapsed + 2,164,059,676 cycles # 2.821 GHz + 3,333,455,852 instructions # 1.54 insn per cycle + 0.826098229 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
4.140636e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.175593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.145692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180551e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.042724 sec +TOTAL : 3.039539 sec INFO: No Floating Point Exceptions have been reported - 9,426,429,638 cycles # 2.856 GHz - 21,229,330,812 instructions # 2.25 insn per cycle - 3.355690618 seconds time elapsed + 9,431,161,386 cycles # 2.859 GHz + 21,152,322,108 instructions # 2.24 insn per cycle + 3.354444882 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.825999e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826873e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826873e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.813946e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814806e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814806e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.991087 sec +TOTAL : 9.050488 sec INFO: No Floating Point Exceptions have been reported - 25,893,198,103 cycles # 2.879 GHz - 79,438,485,543 instructions # 3.07 insn per cycle - 8.995289929 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) + 25,976,132,367 cycles # 2.869 GHz + 79,434,004,937 instructions # 3.06 insn per cycle + 9.054767785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
4789) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.417587e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.420701e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.420701e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.393355e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396331e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396331e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.808506 sec +TOTAL : 4.842609 sec INFO: No Floating Point Exceptions have been reported - 12,725,370,972 cycles # 2.645 GHz - 38,549,760,913 instructions # 3.03 insn per cycle - 4.812856911 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) + 12,860,713,919 cycles # 2.654 GHz + 38,831,953,159 instructions # 3.02 insn per cycle + 4.846945234 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.894022e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.909715e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.909715e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.861975e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.877645e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.877645e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.087240 sec +TOTAL : 2.096124 sec INFO: No Floating Point Exceptions have been reported - 5,528,399,244 cycles # 2.644 GHz - 13,481,627,455 instructions # 2.44 insn per cycle - 2.091666528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) + 5,571,143,137 cycles # 2.654 GHz + 13,624,556,258 instructions # 2.45 insn per cycle + 2.100393193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11434) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.891097e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.911587e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.911587e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.943351e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.963560e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.963560e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.854687 sec +TOTAL : 1.843305 sec INFO: No Floating Point Exceptions have been reported - 4,870,728,509 cycles # 2.622 GHz - 12,137,042,883 instructions # 2.49 insn per cycle - 1.858964728 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) + 4,890,128,479 cycles # 2.648 GHz + 12,303,900,159 instructions # 2.52 insn per cycle + 1.847571545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10344) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.691394e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.702803e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.702803e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.627810e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.638960e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.638960e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.460755 sec +TOTAL : 2.484305 sec INFO: No Floating Point Exceptions have been reported - 4,149,120,121 cycles # 1.684 GHz - 6,337,745,344 instructions # 1.53 insn per cycle - 2.465117818 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) + 4,186,038,092 cycles # 1.683 GHz + 6,400,000,279 instructions # 1.53 insn per cycle + 2.488700095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1982) (512y: 93) (512z: 9359) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 1d3301fafa..91d2dbe837 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:58:52 +DATE: 2024-06-03_18:05:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.467668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.493937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.496432e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.467855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.503291e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527278 sec +TOTAL : 0.529906 sec INFO: No Floating Point Exceptions have been reported - 2,157,795,556 cycles # 2.825 GHz - 3,403,572,710 instructions # 1.58 insn per cycle - 0.822587991 seconds time elapsed + 2,180,833,265 cycles # 2.825 GHz + 3,396,189,647 instructions # 1.56 insn per cycle + 0.831257190 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
4.147774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181635e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182999e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.143984e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177475e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178851e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.025468 sec +TOTAL : 3.032584 sec INFO: No Floating Point Exceptions have been reported - 9,383,589,958 cycles # 2.858 GHz - 21,474,425,018 instructions # 2.29 insn per cycle - 3.338591075 seconds time elapsed + 9,433,736,246 cycles # 2.856 GHz + 19,935,455,469 instructions # 2.11 insn per cycle + 3.360609480 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.819625e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.820503e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.820503e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.807867e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.808713e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.808713e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.022232 sec +TOTAL : 9.081153 sec INFO: No Floating Point Exceptions have been reported - 25,882,405,875 cycles # 2.868 GHz - 79,448,983,201 instructions # 3.07 insn per cycle - 9.026481543 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) + 26,018,917,813 cycles # 2.864 GHz + 79,465,278,181 instructions # 3.05 insn per cycle + 9.085407536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
4445) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.446182e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.449327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.449327e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.392154e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.395092e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395092e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.768299 sec +TOTAL : 4.844253 sec INFO: No Floating Point Exceptions have been reported - 12,681,708,725 cycles # 2.658 GHz - 38,523,479,653 instructions # 3.04 insn per cycle - 4.772643197 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) + 12,831,884,182 cycles # 2.647 GHz + 38,790,118,592 instructions # 3.02 insn per cycle + 4.848502841 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.783293e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.799085e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.799085e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.835313e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.851068e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.851068e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.116349 sec +TOTAL : 2.102167 sec INFO: No Floating Point Exceptions have been reported - 5,573,346,630 cycles # 2.629 GHz - 13,607,371,055 instructions # 2.44 insn per cycle - 2.120654201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) + 5,610,923,922 cycles # 2.665 GHz + 13,739,248,056 instructions # 2.45 insn per cycle + 2.106309675 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11517) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.950987e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.971335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.971335e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.885385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.905960e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.905960e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.841569 sec +TOTAL : 1.854910 sec INFO: No Floating Point Exceptions have been reported - 4,914,422,282 cycles # 2.663 GHz - 12,272,016,530 instructions # 2.50 insn per cycle - 1.845933307 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) + 4,944,878,602 cycles # 2.661 GHz + 12,428,707,476 instructions # 2.51 insn per cycle + 1.859196994 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10335) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.694812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.706153e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.706153e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.620737e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.632179e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.632179e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.459293 sec +TOTAL : 2.486586 sec INFO: No Floating Point Exceptions have been reported - 4,148,774,251 cycles # 1.685 GHz - 6,442,210,372 instructions # 1.55 insn per cycle - 2.463549102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) + 4,188,153,219 cycles # 1.682 GHz + 6,501,924,669 instructions # 1.55 insn per cycle + 2.490871580 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1805) (512y: 191) (512z: 9368) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 2e640fb20e..78b5d57214 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:00:43 +DATE: 2024-06-03_18:07:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070046e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070489e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070728e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.072465e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.072863e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.073011e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.432122 sec +TOTAL : 2.429567 sec INFO: No Floating Point Exceptions have been reported - 7,909,240,752 cycles # 2.872 GHz - 18,000,344,677 instructions # 2.28 insn per cycle - 2.812238946 seconds time elapsed + 7,873,742,017 cycles # 2.858 GHz + 17,189,890,626 instructions # 2.18 insn per cycle + 2.810614216 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 9.257630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.259727e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.259989e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.252226e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.254836e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.255128e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.998685 sec +TOTAL : 3.999219 sec INFO: No Floating Point Exceptions have been reported - 12,358,861,001 cycles # 2.857 GHz - 27,265,356,364 instructions # 2.21 insn per cycle - 4.380623068 seconds time elapsed + 12,383,566,367 cycles # 2.861 GHz + 28,539,894,752 instructions # 2.30 insn per cycle + 4.383788497 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.364227e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.364435e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.364435e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.486298e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.486497e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.486497e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.176405 sec +TOTAL : 7.044010 sec INFO: No Floating Point Exceptions have been reported - 18,821,710,602 cycles # 2.622 GHz - 53,917,723,661 instructions # 2.86 insn per cycle - 7.180812312 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) + 18,875,422,058 cycles # 2.679 GHz + 53,914,980,760 instructions # 2.86 insn per cycle + 7.048043451 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4:32438) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.537482e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.537565e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.537565e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.528426e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528507e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528507e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.438733 sec +TOTAL : 3.459359 sec INFO: No Floating Point Exceptions have been reported - 9,825,974,360 cycles # 2.855 GHz - 27,092,527,909 instructions # 2.76 insn per cycle - 3.442940335 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) + 9,903,492,345 cycles # 2.860 GHz + 27,159,679,932 instructions # 2.74 insn per cycle + 3.463482853 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.317063e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.317444e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.317444e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.294340e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294743e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.596530 sec +TOTAL : 1.608857 sec INFO: No Floating Point Exceptions have been reported - 4,226,902,337 cycles # 2.642 GHz - 9,560,928,493 instructions # 2.26 insn per cycle - 1.600685064 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) + 4,251,273,738 cycles # 2.637 GHz + 9,597,252,474 instructions # 2.26 insn per cycle + 1.613007793 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84989) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.692227e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.692715e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.692715e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.710001e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.710494e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.710494e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.434526 sec +TOTAL : 1.427734 sec INFO: No Floating Point Exceptions have been reported - 3,746,125,551 cycles # 2.606 GHz - 8,486,014,947 instructions # 2.27 insn per cycle - 1.438619859 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) + 3,763,325,270 cycles # 2.630 GHz + 8,521,622,001 instructions # 2.26 insn per cycle + 1.431927276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80643) (512y: 89) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.273656e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.274120e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.274120e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.262704e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263213e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.263213e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.618156 sec +TOTAL : 1.623854 sec INFO: No Floating Point Exceptions have been reported - 2,695,756,195 cycles # 1.663 GHz - 4,273,774,333 instructions # 1.59 insn per cycle - 1.622321455 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) + 2,708,113,519 cycles # 1.665 GHz + 4,288,403,954 instructions # 1.58 insn per cycle + 1.627965968 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2876) (512y: 103) (512z:79119) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 1fadaabb4f..36e5ddb3e5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:30:16 +DATE: 2024-06-03_18:27:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065370e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.066381e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066381e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.069807e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.070797e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.070797e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.397754 sec +TOTAL : 2.390412 sec INFO: No Floating Point Exceptions have been reported - 7,762,497,768 cycles # 2.853 GHz - 16,203,483,295 instructions # 2.09 insn per cycle - 2.776448372 seconds time elapsed + 7,772,262,333 cycles # 2.856 GHz + 17,485,384,479 instructions # 2.25 insn per cycle + 2.777088442 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.238795e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.273940e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.273940e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.214032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.248382e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.248382e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.985494 sec +TOTAL : 3.992588 sec INFO: No Floating Point Exceptions have been reported - 12,320,118,228 cycles # 2.858 GHz - 28,700,363,846 instructions # 2.33 insn per cycle - 4.365416695 seconds time elapsed + 12,381,100,845 cycles # 2.860 GHz + 28,809,274,855 instructions # 2.33 insn per cycle + 4.384322703 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.431447e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.431652e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.431652e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.534429e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.534647e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.534647e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.108151 sec +TOTAL : 7.010505 sec INFO: No Floating Point Exceptions have been reported - 18,792,897,710 cycles # 2.643 GHz - 53,918,227,536 instructions # 2.87 insn per cycle - 7.112225621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) + 19,040,063,694 cycles # 2.716 GHz + 53,915,532,539 instructions # 2.83 insn per cycle + 7.014673205 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32438) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.546658e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546743e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546743e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.522796e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.522880e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.522880e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.418887 sec +TOTAL : 3.472375 sec INFO: No Floating Point Exceptions have been reported - 9,791,096,347 cycles # 2.861 GHz - 27,093,479,045 instructions # 2.77 insn per cycle - 3.423017015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) + 9,921,044,067 cycles # 2.854 GHz + 27,159,090,171 instructions # 2.74 insn per cycle + 3.476580021 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.304069e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.304481e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.304481e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.284550e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284932e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.284932e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.603100 sec +TOTAL : 1.612989 sec INFO: No Floating Point Exceptions have been reported - 4,241,407,347 cycles # 2.640 GHz - 9,561,955,028 instructions # 2.25 insn per cycle - 1.607196815 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) + 4,260,178,809 cycles # 2.636 GHz + 9,599,298,178 instructions # 2.25 insn per cycle + 1.617114232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84989) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.744578e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.745149e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.745149e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.698370e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.698859e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.698859e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.415314 sec +TOTAL : 1.433048 sec INFO: No Floating Point Exceptions have been reported - 3,737,020,554 cycles # 2.634 GHz - 8,486,765,632 instructions # 2.27 insn per cycle - 1.419451948 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) + 3,772,233,442 cycles # 2.626 GHz + 8,522,495,369 instructions # 2.26 insn per cycle + 1.437211202 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80643) (512y: 89) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.289384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.289882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.289882e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.276746e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.277245e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.277245e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.610890 sec +TOTAL : 1.616963 sec INFO: No Floating Point Exceptions have been reported - 2,696,211,668 cycles # 1.670 GHz - 4,273,881,889 instructions # 1.59 insn per cycle - 1.615050450 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) + 2,704,381,455 cycles # 1.669 GHz + 4,289,252,981 instructions # 1.59 insn per cycle + 1.621067952 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2876) (512y: 103) (512z:79119) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index b7c9be9361..586e2a3c40 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:01:38 +DATE: 2024-06-03_18:08:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065481e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065914e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066085e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065479e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065879e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066080e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.431533 sec +TOTAL : 2.433636 sec INFO: No Floating Point Exceptions have been reported - 7,864,728,245 cycles # 2.855 GHz - 16,581,142,625 instructions # 2.11 insn per cycle - 2.810896011 seconds time elapsed + 7,867,649,256 cycles # 2.853 GHz + 17,930,493,406 instructions # 2.28 insn per cycle + 2.813625761 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 9.234334e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.236420e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.236694e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.251408e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.254022e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.254315e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.002654 sec +TOTAL : 3.999341 sec INFO: No Floating Point Exceptions have been reported - 12,362,983,148 cycles # 2.857 GHz - 26,818,544,684 instructions # 2.17 insn per cycle - 4.385714343 seconds time elapsed + 12,370,081,693 cycles # 2.859 GHz + 29,532,832,747 instructions # 2.39 insn per cycle + 4.381754418 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.360451e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.360670e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.360670e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.604921e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.605129e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.605129e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.178712 sec +TOTAL : 6.946520 sec INFO: No Floating Point Exceptions have been reported - 18,903,471,951 cycles # 2.632 GHz - 53,926,959,837 instructions # 2.85 insn per cycle - 7.182807918 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) + 18,898,258,212 cycles # 2.720 GHz + 53,941,018,950 instructions # 2.85 insn per cycle + 6.950480868 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4:32036) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.554111e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.554204e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.554204e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.536936e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.536936e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.401831 sec +TOTAL : 3.440816 sec INFO: No Floating Point Exceptions have been reported - 9,728,814,018 cycles # 2.857 GHz - 27,089,535,875 instructions # 2.78 insn per cycle - 3.405876690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) + 9,897,425,323 cycles # 2.874 GHz + 27,136,976,374 instructions # 2.74 insn per cycle + 3.445026034 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96387) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.279450e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.279875e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.279875e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.330836e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331235e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331235e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.614318 sec +TOTAL : 1.589285 sec INFO: No Floating Point Exceptions have been reported - 4,271,811,996 cycles # 2.641 GHz - 9,560,879,429 instructions # 2.24 insn per cycle - 1.618409778 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) + 4,264,338,826 cycles # 2.677 GHz + 9,591,625,358 instructions # 2.25 insn per cycle + 1.593576676 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84996) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.724611e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.725106e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.725106e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.750204e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.750700e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.750700e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.422019 sec +TOTAL : 1.413085 sec INFO: No Floating Point Exceptions have been reported - 3,745,973,725 cycles # 2.628 GHz - 8,485,619,535 instructions # 2.27 insn per cycle - 1.426190804 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) + 3,752,884,907 cycles # 2.650 GHz + 8,514,358,197 instructions # 2.27 insn per cycle + 1.417168539 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80666) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.244405e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.244886e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.244886e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.285502e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.285994e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285994e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.632336 sec +TOTAL : 1.612194 sec INFO: No Floating Point Exceptions have been reported - 2,716,368,412 cycles # 1.661 GHz - 4,277,085,599 instructions # 1.57 insn per cycle - 1.636609876 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) + 2,702,142,076 cycles # 1.673 GHz + 4,289,047,207 instructions # 1.59 insn per cycle + 1.616308896 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2713) (512y: 185) (512z:79103) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 9454f64bcc..28e6c74910 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:02:33 +DATE: 2024-06-03_18:09:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.559368e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.560230e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.560569e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.282693e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.283455e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.283883e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.689610 sec +TOTAL : 1.743706 sec INFO: No Floating Point Exceptions have been reported - 5,589,146,316 cycles # 2.841 GHz - 11,119,486,865 instructions # 1.99 insn per cycle - 2.023529940 seconds time elapsed + 5,709,155,447 cycles # 2.851 GHz + 12,383,853,252 instructions # 2.17 insn per cycle + 2.059074311 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 2.312667e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.313583e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.313721e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.937585 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.136623e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.137393e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.137518e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 +TOTAL : 2.053767 sec INFO: No Floating Point Exceptions have been reported - 6,322,260,150 cycles # 2.852 GHz - 12,991,612,203 instructions # 2.05 insn per cycle - 2.273050569 seconds time elapsed + 6,600,822,454 cycles # 2.853 GHz + 14,650,396,107 instructions # 2.22 insn per cycle + 2.369651749 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849636e-03 -Avg ME (F77/GPU) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (C++/GPU) = 9.849635e-03 +Avg ME (F77/GPU) = 9.8712451931260159E-003 +Relative difference = 0.0021940095370046923 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.449095e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.449366e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.449366e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.350768e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.351017e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.351017e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.254155 sec +TOTAL : 6.328437 sec INFO: No Floating Point Exceptions have been reported - 17,924,189,515 cycles # 2.865 GHz - 53,589,289,728 instructions # 2.99 insn per cycle - 6.258212218 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) + 18,112,133,095 cycles # 2.861 GHz + 53,917,761,859 instructions # 2.98 insn per cycle + 6.332410128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087551509E-003 +Relative difference = 2.119780432912131e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.313591e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.314004e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.314004e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.302446e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.302838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.302838e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.598328 sec +TOTAL : 1.603618 sec INFO: No Floating Point Exceptions have been reported - 4,580,110,582 cycles # 2.860 GHz - 13,761,912,039 instructions # 3.00 insn per cycle - 1.602548280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96986) 
(avx2: 0) (512y: 0) (512z: 0) + 4,595,250,388 cycles # 2.859 GHz + 13,814,198,944 instructions # 3.01 insn per cycle + 1.607761589 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:97032) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546896367235E-003 +Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.493632e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.495239e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.495239e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.461361e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.463009e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.463009e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.817743 sec +TOTAL : 0.824355 sec 
INFO: No Floating Point Exceptions have been reported - 2,143,262,417 cycles # 2.610 GHz - 4,816,174,375 instructions # 2.25 insn per cycle - 0.821948335 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) + 2,147,702,206 cycles # 2.594 GHz + 4,843,386,973 instructions # 2.26 insn per cycle + 0.828633688 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85515) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.535714e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.537917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.537917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.529713e+02 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.531799e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.531799e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.705994 sec +TOTAL : 0.706721 sec INFO: No Floating Point Exceptions have been reported - 1,872,359,401 cycles # 2.639 GHz - 4,273,597,055 instructions # 2.28 insn per cycle - 0.710000254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) + 1,879,943,341 cycles # 2.647 GHz + 4,297,878,746 instructions # 2.29 insn per cycle + 0.710927830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81214) (512y: 44) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / 
`nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.586004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587962e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587962e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.551582e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.553505e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.553505e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.806945 sec +TOTAL : 0.810975 sec INFO: No Floating Point Exceptions have been reported - 1,355,141,176 cycles # 1.672 GHz - 2,158,222,960 instructions # 1.59 insn per cycle - 0.811081253 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) + 1,364,375,785 cycles # 1.675 GHz + 2,169,142,049 instructions # 1.59 insn per cycle + 0.815064535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3493) (512y: 47) (512z:79334) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (F77/C++) = 9.8929811982676284E-003 +Relative difference = 2.004124217057488e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index c3dad58c83..859b38d5c1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:31:10 +DATE: 2024-06-03_18:28:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.586248e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588038e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588038e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.639511 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.295855e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.297495e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.297495e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187093e-05 +- 9.825663e-06 ) GeV^-6 +TOTAL : 1.706058 sec INFO: No Floating Point Exceptions have been reported - 5,458,859,849 cycles # 2.849 GHz - 11,717,497,877 instructions # 2.15 insn per cycle - 1.972700319 seconds time elapsed + 5,593,764,153 cycles # 2.850 GHz + 11,978,491,776 instructions # 2.14 insn per cycle + 2.019336398 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,24 +79,24 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.304060e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.317709e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.317709e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.924656 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.117757e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.129477e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129477e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856440e-04 +- 8.331091e-05 ) GeV^-6 +TOTAL : 2.042960 sec INFO: No Floating Point Exceptions have been reported - 6,299,130,122 cycles # 2.862 GHz - 13,932,257,788 instructions # 2.21 insn per cycle - 2.257564113 seconds time elapsed + 6,558,229,128 cycles # 2.852 GHz + 14,467,859,228 instructions # 2.21 insn per cycle + 2.357922051 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849636e-03 -Avg ME (F77/GPU) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (C++/GPU) = 9.849635e-03 +Avg ME (F77/GPU) = 9.8712451931260159E-003 +Relative difference = 0.0021940095370046923 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.449386e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.449634e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.449634e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.173840e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.174090e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.174090e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.252998 sec +TOTAL : 6.464931 sec INFO: No Floating Point Exceptions have been reported - 17,934,331,616 cycles # 2.867 GHz - 53,590,587,156 instructions # 2.99 insn per cycle - 6.257054326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) + 18,464,188,622 cycles # 2.855 GHz + 53,919,219,439 instructions # 2.92 insn per cycle + 6.469025984 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087551509E-003 +Relative difference = 2.119780432912131e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.318510e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.299445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.299838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299838e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.596218 sec +TOTAL : 1.605494 sec INFO: No Floating Point Exceptions have been reported - 4,578,862,735 cycles # 2.862 GHz - 13,762,757,180 instructions # 3.01 insn per cycle - 1.600379529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= 
(~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) + 4,600,449,973 cycles # 2.860 GHz + 13,814,867,008 instructions # 3.00 insn per cycle + 1.609640985 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:97032) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546896367235E-003 +Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.561862e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.563430e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.563430e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.565702e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.567481e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.567481e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.810097 sec 
+TOTAL : 0.809807 sec INFO: No Floating Point Exceptions have been reported - 2,150,908,758 cycles # 2.644 GHz - 4,817,064,263 instructions # 2.24 insn per cycle - 0.814268690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) + 2,149,625,871 cycles # 2.644 GHz + 4,844,274,687 instructions # 2.25 insn per cycle + 0.813973066 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85515) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.557169e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.559485e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.559485e+02 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 7.487567e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.489594e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489594e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.704248 sec +TOTAL : 0.710407 sec INFO: No Floating Point Exceptions have been reported - 1,860,720,782 cycles # 2.629 GHz - 4,274,198,133 instructions # 2.30 insn per cycle - 0.708306150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) + 1,891,140,135 cycles # 2.649 GHz + 4,298,530,156 instructions # 2.27 insn per cycle + 0.714547469 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81214) (512y: 44) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] 
('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.582585e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.584548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.584548e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.527055e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.529167e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.529167e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.807471 sec +TOTAL : 0.814394 sec INFO: No Floating Point Exceptions have been reported - 1,357,966,185 cycles # 1.671 GHz - 2,159,181,276 instructions # 1.59 insn per cycle - 0.813237032 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) + 1,366,391,214 cycles # 1.671 GHz + 2,170,869,275 instructions # 1.59 insn per cycle + 0.818526272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3493) (512y: 47) (512z:79334) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (F77/C++) = 9.8929811982676284E-003 +Relative difference = 2.004124217057488e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 5816b2c2c2..a86a223e97 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:03:12 +DATE: 2024-06-03_18:10:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.532712e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.533546e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.533906e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.288627e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.289386e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.289766e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.691126 sec +TOTAL : 1.745254 sec INFO: No Floating Point Exceptions have been reported - 5,622,580,002 cycles # 2.845 GHz - 11,510,934,287 instructions # 2.05 insn per cycle - 2.034693744 seconds time elapsed + 5,703,053,044 cycles # 2.850 GHz + 11,232,321,378 instructions # 1.97 insn per cycle + 2.059584789 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 2.322031e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.322933e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.323068e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.930959 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.130933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.131711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131834e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 +TOTAL : 2.058680 sec INFO: No Floating Point Exceptions have been reported - 6,307,622,079 cycles # 2.854 GHz - 13,926,153,465 instructions # 2.21 insn per cycle - 2.267828569 seconds time elapsed + 6,607,736,050 cycles # 2.851 GHz + 13,912,202,043 instructions # 2.11 insn per cycle + 2.376581429 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849636e-03 -Avg ME (F77/GPU) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (C++/GPU) = 9.849635e-03 +Avg ME (F77/GPU) = 9.8712451931260107E-003 +Relative difference = 0.0021940095370041636 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.476563e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.476825e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.476825e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.251653e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.251887e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.251887e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.234385 sec +TOTAL : 6.403228 sec INFO: No Floating Point Exceptions have been reported - 17,827,850,406 cycles # 2.859 GHz - 53,580,311,893 instructions # 3.01 insn per cycle - 6.238431546 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) + 18,326,271,023 cycles # 2.861 GHz + 53,904,115,772 instructions # 2.94 insn per cycle + 6.407222242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087582491E-003 -Relative difference = 2.1198118933954545e-08 +Avg ME (F77/C++) = 9.8479612087572898E-003 +Relative difference = 2.1198021522715588e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.320430e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320855e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320855e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.311127e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.311516e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.311516e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.594879 sec +TOTAL : 1.599283 sec INFO: No Floating Point Exceptions have been reported - 4,567,314,747 cycles # 2.858 GHz - 13,755,226,123 instructions # 3.01 insn per cycle - 1.598972704 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96606) 
(avx2: 0) (512y: 0) (512z: 0) + 4,591,437,389 cycles # 2.865 GHz + 13,807,198,493 instructions # 3.01 insn per cycle + 1.603441295 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896225560E-003 -Relative difference = 3.151694379513441e-08 +Avg ME (F77/C++) = 9.8479546896065809E-003 +Relative difference = 3.151856596628469e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.611819e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.613638e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.613638e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.548285e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.549831e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.549831e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.803348 sec +TOTAL : 0.811520 sec 
INFO: No Floating Point Exceptions have been reported - 2,141,149,617 cycles # 2.654 GHz - 4,818,402,736 instructions # 2.25 insn per cycle - 0.807515784 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) + 2,161,880,934 cycles # 2.653 GHz + 4,847,309,344 instructions # 2.24 insn per cycle + 0.815674039 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85905) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728161091923E-003 +Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.568391e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.570476e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.570476e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.566981e+02 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.569078e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.569078e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.704023 sec +TOTAL : 0.702924 sec INFO: No Floating Point Exceptions have been reported - 1,875,444,352 cycles # 2.651 GHz - 4,275,225,721 instructions # 2.28 insn per cycle - 0.708142027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) + 1,875,062,433 cycles # 2.655 GHz + 4,301,131,886 instructions # 2.29 insn per cycle + 0.707001616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81754) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728161091923E-003 +Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / 
`nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.586825e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588943e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.542502e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.544426e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.544426e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.806684 sec +TOTAL : 0.812292 sec INFO: No Floating Point Exceptions have been reported - 1,360,116,629 cycles # 1.679 GHz - 2,164,473,202 instructions # 1.59 insn per cycle - 0.810866699 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3487) (512y: 34) (512z:79499) + 1,367,647,712 cycles # 1.677 GHz + 2,175,950,494 instructions # 1.59 insn per cycle + 0.816336991 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4106) (512y: 32) (512z:79555) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982955140E-003 -Relative difference = 2.0044060904369713e-08 +Avg ME (F77/C++) = 9.8929811982957326E-003 +Relative difference = 2.0044082998332894e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 74b152faa4..12658e1990 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:03:51 +DATE: 2024-06-03_18:10:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.688684e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689237e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.689554e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691014e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.691544e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.691771e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.181191 sec +TOTAL : 2.180865 sec INFO: No Floating Point Exceptions have been reported - 7,135,001,611 cycles # 2.850 GHz - 13,803,494,373 instructions # 1.93 insn per cycle - 2.561335539 seconds time elapsed + 7,147,217,142 cycles # 2.856 GHz + 15,944,601,902 instructions # 2.23 insn per cycle + 2.561081631 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 1.110278e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110684e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111225e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111549e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111585e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.414487 sec +TOTAL : 3.420115 sec INFO: No Floating Point Exceptions have been reported - 10,695,558,623 cycles # 2.859 GHz - 24,933,179,401 instructions # 2.33 insn per cycle - 3.798291401 seconds time elapsed + 10,709,106,790 cycles # 2.860 GHz + 22,014,750,790 instructions # 2.06 insn per cycle + 3.803081141 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.308623e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.308815e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.308815e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.215233e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.215412e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.215412e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.229792 sec +TOTAL : 7.320173 sec INFO: No Floating Point Exceptions have been reported - 19,160,887,501 cycles # 2.649 GHz - 54,158,064,644 instructions # 2.83 insn per cycle - 7.233871441 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) + 19,183,009,981 cycles # 2.620 GHz + 54,142,256,875 instructions # 2.82 insn per cycle + 7.324194367 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4:32014) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.495915e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496004e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496004e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.502066e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.502148e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.502148e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.534835 sec +TOTAL : 3.520744 sec INFO: No Floating Point Exceptions have been reported - 9,323,019,385 cycles # 2.635 GHz - 26,159,152,582 instructions # 2.81 insn per cycle - 3.538902916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) + 9,378,849,301 cycles # 2.662 GHz + 26,196,672,297 instructions # 2.79 insn per cycle + 3.524768328 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96060) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.446264e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.446692e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.446692e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.457096e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.457531e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.457531e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.538022 sec +TOTAL : 1.532469 sec INFO: No Floating Point Exceptions have been reported - 4,070,123,740 cycles # 2.641 GHz - 9,227,321,198 instructions # 2.27 insn per cycle - 1.542213209 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) + 4,070,106,033 cycles # 2.650 GHz + 9,257,201,603 instructions # 2.27 insn per cycle + 1.536642002 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84397) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.010931e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.011595e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.011595e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.982544e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.983110e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.983110e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.321905 sec +TOTAL : 1.331088 sec INFO: No Floating Point Exceptions have been reported - 3,507,528,638 cycles # 2.647 GHz - 8,174,534,380 instructions # 2.33 insn per cycle - 1.326023150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) + 3,549,141,158 cycles # 2.659 GHz + 8,190,051,527 instructions # 2.31 insn per cycle + 1.335300673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80028) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.380863e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.381372e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.381372e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.356629e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.357180e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.357180e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.567452 sec +TOTAL : 1.578537 sec INFO: No Floating Point Exceptions have been reported - 2,624,094,649 cycles # 1.671 GHz - 4,154,491,609 instructions # 1.58 insn per cycle - 1.571532309 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) + 2,632,257,170 cycles # 1.664 GHz + 4,179,824,767 instructions # 1.59 insn per cycle + 1.582628942 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2614) (512y: 93) (512z:78909) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 8617043553..3ead6e031a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:04:44 +DATE: 2024-06-03_18:11:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.686084e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.686624e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.686882e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.676327e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.676864e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.677096e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.181122 sec +TOTAL : 2.185697 sec INFO: No Floating Point Exceptions have been reported - 7,147,611,369 cycles # 2.854 GHz - 14,853,271,942 instructions # 2.08 insn per cycle - 2.560951446 seconds time elapsed + 7,180,493,419 cycles # 2.851 GHz + 16,069,308,155 instructions # 2.24 insn per cycle + 2.576733860 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] 
(23) = ( 1.108087e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108460e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108504e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112258e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112629e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.416139 sec +TOTAL : 3.410661 sec INFO: No Floating Point Exceptions have been reported - 10,697,523,108 cycles # 2.858 GHz - 25,124,574,585 instructions # 2.35 insn per cycle - 3.798642472 seconds time elapsed + 10,670,880,741 cycles # 2.855 GHz + 24,536,777,363 instructions # 2.30 insn per cycle + 3.793405431 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.809282e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.809493e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.809493e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.751942e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.752149e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.752149e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.765846 sec +TOTAL : 6.826971 sec INFO: No Floating Point Exceptions have been reported - 19,320,733,855 cycles # 2.854 GHz - 54,152,931,560 instructions # 2.80 insn per cycle - 6.769909136 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) + 19,135,732,377 cycles # 2.802 GHz + 54,164,572,300 instructions # 2.83 insn per cycle + 6.831071371 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4:32216) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.498471e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498552e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498552e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.488222e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.488300e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.488300e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.528542 sec +TOTAL : 3.552677 sec INFO: No Floating Point Exceptions have been reported - 9,412,357,855 cycles # 2.665 GHz - 26,078,069,796 instructions # 2.77 insn per cycle - 3.532621942 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) + 9,347,642,331 cycles # 2.629 GHz + 26,094,609,262 instructions # 2.79 insn per cycle + 3.556816913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.535316e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535751e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.535751e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.489469e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489938e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489938e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.497994 sec +TOTAL : 1.518082 sec INFO: No Floating Point Exceptions have been reported - 4,026,588,228 cycles # 2.682 GHz - 9,213,775,354 instructions # 2.29 insn per cycle - 1.502163303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) + 4,031,634,285 cycles # 2.650 GHz + 9,220,204,880 instructions # 2.29 insn per cycle + 1.522254552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.075585e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.076181e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.076181e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.987027e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.987615e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.987615e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.300193 sec +TOTAL : 1.329462 sec INFO: No Floating Point Exceptions have been reported - 3,527,583,777 cycles # 2.706 GHz - 8,167,337,738 instructions # 2.32 insn per cycle - 1.304488773 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) + 3,525,556,122 cycles # 2.645 GHz + 8,174,876,823 instructions # 2.32 insn per cycle + 1.333461169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79434) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.471286e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.471806e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471806e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.361926e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.362434e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.362434e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.527099 sec +TOTAL : 1.576463 sec INFO: No Floating Point Exceptions have been reported - 2,623,859,326 cycles # 1.714 GHz - 4,153,167,835 instructions # 1.58 insn per cycle - 1.531362339 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) + 2,635,723,343 cycles # 1.668 GHz + 4,174,517,203 instructions # 1.58 insn per cycle + 1.580636581 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1878) (512y: 175) (512z:78883) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index e2998d6ab4..22bd4ba540 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_20:59:25 +DATE: 2024-06-03_18:06:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.755226e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.274442e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.625510e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.420650e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.186340e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.600584e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455971 sec +TOTAL : 0.457505 sec INFO: No Floating Point Exceptions have been reported - 1,884,775,023 cycles # 2.810 GHz - 2,642,675,236 instructions # 1.40 insn per cycle - 0.728830794 seconds time elapsed + 1,899,272,623 cycles # 2.815 GHz + 2,683,069,645 instructions # 1.41 insn per cycle + 0.731629286 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = 
( 3.160169e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.139027e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.542035e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.262197e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.143942e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.550325e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.535733 sec +TOTAL : 0.536930 sec INFO: No Floating Point Exceptions have been reported - 2,191,012,800 cycles # 2.827 GHz - 3,140,675,750 instructions # 1.43 insn per cycle - 0.832367820 seconds time elapsed + 2,194,237,717 cycles # 2.824 GHz + 3,155,658,053 instructions # 1.44 insn per cycle + 0.833915066 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.013225e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.034611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.034611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.055928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055928e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.640289 sec +TOTAL : 1.608174 sec INFO: No Floating Point Exceptions have been reported - 4,709,353,007 cycles # 2.865 GHz - 13,462,429,209 instructions # 2.86 insn per cycle - 1.644426545 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) + 4,627,352,906 cycles # 2.871 GHz + 13,198,524,909 instructions # 2.85 insn per cycle + 1.612420837 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
720) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.842535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.913005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.913005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893565e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.912012 sec +TOTAL : 0.920644 sec INFO: No Floating Point Exceptions have been reported - 2,622,620,970 cycles # 2.864 GHz - 7,552,013,729 instructions # 2.88 insn per cycle - 0.916398164 seconds time elapsed -=Symbols in CPPProcess_cpp.o= 
(~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) + 2,651,938,832 cycles # 2.869 GHz + 7,562,803,805 instructions # 2.85 insn per cycle + 0.924952570 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.080547e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.281001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.281001e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.066566e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.264728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.264728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.555084 sec +TOTAL : 0.557193 sec 
INFO: No Floating Point Exceptions have been reported - 1,478,897,839 cycles # 2.647 GHz - 3,119,129,700 instructions # 2.11 insn per cycle - 0.559398478 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) + 1,496,586,992 cycles # 2.668 GHz + 3,166,660,688 instructions # 2.12 insn per cycle + 0.561453315 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3002) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.427781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674591e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.416441e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.661152e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.661152e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501239 sec +TOTAL : 0.502498 sec INFO: No Floating Point Exceptions have been reported - 1,340,705,970 cycles # 2.655 GHz - 2,981,253,669 instructions # 2.22 insn per cycle - 0.505572840 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) + 1,353,923,140 cycles # 2.675 GHz + 3,021,240,550 instructions # 2.23 insn per cycle + 0.506695350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2769) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 
-EvtsPerSec[Rmb+ME] (23) = ( 2.239263e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345599e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.242663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.346403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.346403e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.757058 sec +TOTAL : 0.755886 sec INFO: No Floating Point Exceptions have been reported - 1,333,648,350 cycles # 1.754 GHz - 1,953,454,025 instructions # 1.46 insn per cycle - 0.761354043 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) + 1,335,151,347 cycles # 1.758 GHz + 1,969,686,710 instructions # 1.48 insn per cycle + 0.760145815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1391) (512y: 106) (512z: 2217) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index ea21ef5e35..bf06ae833c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:28:51 +DATE: 2024-06-03_18:26:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.419906e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.143705e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.143705e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.507743e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.244408e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.244408e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.485647 sec +TOTAL : 0.483649 sec INFO: No Floating Point Exceptions have been reported - 1,977,356,966 cycles # 2.831 GHz - 2,947,602,370 instructions # 1.49 insn per cycle - 0.756831394 seconds time elapsed + 1,963,989,630 cycles # 2.822 GHz + 2,915,685,197 instructions # 1.48 insn per cycle + 0.754321456 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.235107e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556413e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556413e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.256162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571113e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571113e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757410 sec +TOTAL : 0.756334 sec INFO: No Floating Point Exceptions have been reported - 2,885,831,764 cycles # 2.864 GHz - 4,411,934,467 instructions # 1.53 insn per cycle - 1.066148439 seconds time elapsed + 2,857,254,096 cycles # 2.830 GHz + 4,390,239,241 instructions # 1.54 insn per cycle + 1.068102530 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.023172e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.045212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.045212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028953e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.632598 sec +TOTAL : 1.623491 sec INFO: No Floating Point Exceptions have been reported - 4,757,113,003 cycles # 2.909 GHz - 13,469,643,583 instructions # 2.83 insn per cycle - 1.637219700 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) + 4,669,332,077 cycles # 2.869 GHz + 13,205,693,220 instructions # 2.83 insn per cycle + 1.628130363 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 720) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.853468e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.927865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.881304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.881304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915572 sec +TOTAL : 0.936476 sec INFO: No Floating Point Exceptions have been reported - 2,673,568,028 cycles # 2.908 GHz - 7,602,475,789 instructions # 2.84 insn per cycle - 0.920287554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) + 2,697,621,166 cycles # 2.868 GHz + 7,612,311,404 instructions # 2.82 insn per cycle + 0.941119380 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3116) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.103941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.314249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.314249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.030271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.230400e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.559050 sec +TOTAL : 0.572120 sec INFO: No Floating Point Exceptions have been reported - 1,524,948,008 cycles # 2.709 GHz - 3,168,482,011 instructions # 2.08 insn per cycle - 0.563565721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) 
(512y: 0) (512z: 0) + 1,540,597,129 cycles # 2.674 GHz + 3,215,714,008 instructions # 2.09 insn per cycle + 0.576670244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3002) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.488503e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.747329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.747329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.370959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.617466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617466e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.500126 sec +TOTAL : 0.518302 sec INFO: No 
Floating Point Exceptions have been reported - 1,382,997,758 cycles # 2.744 GHz - 3,030,723,769 instructions # 2.19 insn per cycle - 0.504745068 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) + 1,404,029,571 cycles # 2.688 GHz + 3,072,237,447 instructions # 2.19 insn per cycle + 0.523023062 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2769) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.290455e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.403634e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.403634e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.218921e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 2.326156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.326156e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.749461 sec +TOTAL : 0.772656 sec INFO: No Floating Point Exceptions have been reported - 1,376,974,835 cycles # 1.828 GHz - 1,993,483,040 instructions # 1.45 insn per cycle - 0.754191421 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) + 1,381,653,206 cycles # 1.779 GHz + 2,009,158,338 instructions # 1.45 insn per cycle + 0.777302997 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1391) (512y: 106) (512z: 2217) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index e245581a8d..528c14820e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_20:59:38 +DATE: 2024-06-03_18:06:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.726582e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.144906e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.472843e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387528e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.087353e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.479798e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.456749 sec +TOTAL : 0.454641 sec INFO: No Floating Point Exceptions have been reported - 1,887,155,378 cycles # 2.815 GHz - 2,674,611,026 instructions # 1.42 insn per cycle - 0.729330088 seconds time elapsed + 1,892,592,028 cycles # 2.818 GHz + 2,644,916,478 instructions # 1.40 insn per cycle + 0.728910769 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.182415e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.046068e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.438938e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.251456e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055592e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.448912e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537611 sec +TOTAL : 0.537942 sec INFO: No Floating Point Exceptions have been reported - 2,190,802,810 cycles # 2.823 GHz - 3,133,468,280 instructions # 1.43 insn per cycle - 0.833932090 seconds time elapsed + 2,196,827,706 cycles # 2.825 GHz + 3,163,011,259 instructions # 1.44 insn per cycle + 0.835779565 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.009988e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.031220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.031220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051109e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.645685 sec +TOTAL : 1.615636 sec INFO: No Floating Point Exceptions have been reported - 4,722,807,018 cycles # 2.864 GHz - 13,456,640,489 instructions # 2.85 insn per cycle - 1.649922962 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) + 4,630,123,707 cycles # 2.860 GHz + 13,187,048,762 instructions # 2.85 insn per cycle + 1.619942385 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 705) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.814484e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882953e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.926401 sec +TOTAL : 0.925381 sec INFO: No Floating Point Exceptions have been reported - 2,657,840,392 cycles # 2.859 GHz - 7,551,476,794 instructions # 2.84 insn per cycle - 0.930660111 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) + 2,650,904,425 cycles # 2.853 GHz + 7,560,878,860 instructions # 2.85 insn per cycle + 0.929809663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3110) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.099699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.310121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.310121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.042543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.239403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.239403e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.551668 sec +TOTAL : 0.561428 sec INFO: No Floating Point Exceptions have been reported - 1,479,744,333 cycles # 2.665 GHz - 3,118,004,055 instructions # 2.11 insn per cycle - 0.555928182 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2900) (512y: 0) 
(512z: 0) + 1,506,189,984 cycles # 2.666 GHz + 3,165,859,184 instructions # 2.10 insn per cycle + 0.565660418 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2987) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.438094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687392e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687392e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.386752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.627703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.627703e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.499527 sec +TOTAL : 0.507110 sec INFO: No Floating Point Exceptions 
have been reported - 1,342,931,484 cycles # 2.669 GHz - 2,978,966,446 instructions # 2.22 insn per cycle - 0.503782569 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) + 1,357,913,949 cycles # 2.659 GHz + 3,018,493,039 instructions # 2.22 insn per cycle + 0.511448840 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2743) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.249330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.355241e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.355241e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.238275e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
2.344573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.344573e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.753724 sec +TOTAL : 0.757097 sec INFO: No Floating Point Exceptions have been reported - 1,329,841,706 cycles # 1.756 GHz - 1,951,471,549 instructions # 1.47 insn per cycle - 0.758013007 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) + 1,334,322,654 cycles # 1.754 GHz + 1,967,860,641 instructions # 1.47 insn per cycle + 0.761402568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1368) (512y: 106) (512z: 2217) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 3a86532d9d..7c8aa5030d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp 
make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_20:59:52 +DATE: 2024-06-03_18:06:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.556743e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220581e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.348241e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.550928e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.024345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134616e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.446170 sec +TOTAL : 0.449386 sec INFO: No Floating Point Exceptions have been reported - 1,860,217,734 cycles # 2.814 GHz - 2,643,257,485 instructions # 1.42 insn per cycle - 0.717600829 seconds time elapsed + 1,888,054,726 cycles # 2.813 GHz + 2,654,260,383 instructions # 1.41 insn per cycle + 0.728565297 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 165 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.900642e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.805238e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.969673e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.486879 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.491395e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.519769e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624892e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 +TOTAL : 0.490219 sec INFO: No Floating Point Exceptions have been reported - 2,017,900,407 cycles # 2.822 GHz - 2,883,772,994 instructions # 1.43 insn per cycle - 0.772554185 seconds time elapsed + 2,021,512,627 cycles # 2.815 GHz + 2,891,282,771 instructions # 1.43 insn per cycle + 0.776542002 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 +Avg ME (F77/GPU) = 0.14247487904286338 +Relative difference = 0.0003670698531228044 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.071641e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095909e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.550578 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.082711e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107977e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.534634 sec INFO: No Floating Point Exceptions have been reported - 4,458,106,622 cycles # 2.869 GHz - 13,047,664,900 instructions # 2.93 insn per cycle - 1.554777575 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) + 4,415,097,381 cycles # 2.871 GHz + 12,958,570,213 instructions # 2.94 insn per cycle + 1.538797105 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 658) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.867972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.052825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.052825e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.592805 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.819766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.993390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993390e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.601639 sec INFO: No Floating Point Exceptions have been reported - 1,702,248,935 cycles # 2.855 GHz - 4,512,704,282 instructions # 2.65 insn per cycle - 0.597040153 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) + 1,735,482,071 cycles # 2.869 GHz + 4,549,139,048 instructions # 2.62 insn per cycle + 0.605662961 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.475375e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.145092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.145092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.533626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.225203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.225203e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL 
: 0.320934 sec +TOTAL : 0.317106 sec INFO: No Floating Point Exceptions have been reported - 853,448,782 cycles # 2.630 GHz - 1,896,008,778 instructions # 2.22 insn per cycle - 0.325158867 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) + 862,393,564 cycles # 2.690 GHz + 1,924,665,702 instructions # 2.23 insn per cycle + 0.321250486 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3584) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.798508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.559778e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.559778e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
5.943619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.727716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.727716e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.304197 sec +TOTAL : 0.296810 sec INFO: No Floating Point Exceptions have been reported - 803,501,124 cycles # 2.610 GHz - 1,818,839,783 instructions # 2.26 insn per cycle - 0.308391634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) + 810,008,261 cycles # 2.697 GHz + 1,841,327,409 instructions # 2.27 insn per cycle + 0.300975383 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3414) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.329166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.749966e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.749966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.370645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.799268e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.799268e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.402147 sec +TOTAL : 0.398078 sec INFO: No Floating Point Exceptions have been reported - 735,010,684 cycles # 1.812 GHz - 1,304,684,504 instructions # 1.78 insn per cycle - 0.406336425 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) + 737,475,489 cycles # 1.836 GHz + 1,315,863,036 instructions # 1.78 insn per cycle + 0.402348444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2005) (512y: 32) (512z: 2432) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 94d91d36db..b474391c5e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:29:04 +DATE: 2024-06-03_18:26:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.395370e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.197522e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.197522e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.460501 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.399533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.035371e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.035371e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 +TOTAL : 0.461527 sec INFO: No Floating Point Exceptions have been reported - 1,922,773,738 cycles # 2.852 GHz - 2,838,003,502 instructions # 1.48 insn per cycle - 0.731092352 seconds time elapsed + 1,898,709,085 cycles # 2.819 GHz + 2,794,611,500 instructions # 1.47 insn per cycle + 0.730805065 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 165 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.025472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.955203e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.955203e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.631520 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.018371e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.847717e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.847717e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 +TOTAL : 0.631803 sec INFO: No Floating Point Exceptions have been reported - 2,501,267,965 cycles # 2.864 GHz - 3,790,017,450 instructions # 1.52 insn per cycle - 0.930310379 seconds time elapsed + 2,475,233,605 cycles # 2.829 GHz + 3,755,433,498 instructions # 1.52 insn per cycle + 0.931704895 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -95,8 +95,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 +Avg ME (F77/GPU) = 0.14247487904286338 +Relative difference = 0.0003670698531228044 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108811e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.537914 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.077062e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103594e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.546741 sec INFO: No Floating Point Exceptions have been reported - 4,486,361,889 cycles # 2.911 GHz - 13,052,814,653 instructions # 2.91 insn per cycle - 1.542180616 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) + 4,439,504,684 cycles # 2.864 GHz + 12,962,981,414 instructions # 2.92 insn per cycle + 1.551131545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 658) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.870842e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.059379e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.059379e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.597066 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.800148e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.977418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.977418e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.611068 sec INFO: No Floating Point Exceptions have been reported - 1,730,283,965 cycles # 2.880 GHz - 4,559,978,438 instructions # 2.64 insn per cycle - 0.601626043 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) + 1,762,180,341 cycles # 2.866 GHz + 4,596,893,163 instructions # 2.61 insn per cycle + 0.615465189 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.495825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.177328e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.177328e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.468716e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.144921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.144921e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 
1.429922e+01 ) GeV^-2 -TOTAL : 0.324345 sec +TOTAL : 0.325673 sec INFO: No Floating Point Exceptions have been reported - 873,246,909 cycles # 2.662 GHz - 1,932,851,891 instructions # 2.21 insn per cycle - 0.328693833 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) + 885,472,071 cycles # 2.688 GHz + 1,961,444,173 instructions # 2.22 insn per cycle + 0.330024390 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3584) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.853914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.645046e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.645046e+05 
) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.852754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.629189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.629189e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.305954 sec +TOTAL : 0.306077 sec INFO: No Floating Point Exceptions have been reported - 825,418,764 cycles # 2.665 GHz - 1,855,748,763 instructions # 2.25 insn per cycle - 0.310269538 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) + 831,476,352 cycles # 2.684 GHz + 1,878,263,824 instructions # 2.26 insn per cycle + 0.310443537 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3414) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 
512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.313371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.736001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736001e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.306370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.724359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.724359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.408387 sec +TOTAL : 0.408894 sec INFO: No Floating Point Exceptions have been reported - 758,493,445 cycles # 1.840 GHz - 1,345,737,027 instructions # 1.77 insn per cycle - 0.412742502 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) + 761,582,213 cycles # 1.846 GHz + 1,357,292,696 instructions # 1.78 insn per cycle + 0.413233854 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2005) (512y: 32) (512z: 2432) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 05c0e197eb..dad301565b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:00:04 +DATE: 2024-06-03_18:07:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.474693e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.189633e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.316136e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.607493e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041375e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.154202e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.447182 sec +TOTAL : 0.451358 sec INFO: No Floating Point Exceptions have been reported - 1,893,556,574 cycles # 2.816 GHz - 2,664,714,918 instructions # 1.41 insn per cycle - 0.729010855 seconds time elapsed + 1,877,588,397 cycles # 2.819 GHz + 2,627,708,031 instructions # 1.40 insn per cycle + 0.724593614 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 164 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.711155e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788726e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.932305e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.487543 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.561024e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.555409e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667951e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 +TOTAL : 0.490635 sec INFO: No Floating Point Exceptions have been reported - 2,013,294,748 cycles # 2.813 GHz - 2,871,604,214 instructions # 1.43 insn per cycle - 0.773944888 seconds time elapsed + 2,044,448,827 cycles # 2.838 GHz + 2,884,451,461 instructions # 1.41 insn per cycle + 0.778204852 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 +Avg ME (F77/GPU) = 0.14247487904286338 +Relative difference = 0.0003670698531228044 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093696e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093696e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.554121 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.081260e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.105951e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105951e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.536942 sec INFO: No Floating Point Exceptions have been reported - 4,457,740,413 cycles # 2.864 GHz - 13,029,198,665 instructions # 2.92 insn per cycle - 1.558378600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) + 4,412,513,718 cycles # 2.865 GHz + 12,934,261,743 instructions # 2.93 insn per cycle + 1.541139293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 643) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.886512e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.070924e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.588580 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.813029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.990415e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.990415e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.603038 sec INFO: No Floating Point Exceptions have been reported - 1,693,887,931 cycles # 2.861 GHz - 4,507,886,410 instructions # 2.66 insn per cycle - 0.592760485 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) + 1,737,431,119 cycles # 2.864 GHz + 4,543,468,610 instructions # 2.62 insn per cycle + 0.607274552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.575079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.270984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.270984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.462861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.127062e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.127062e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL 
: 0.315242 sec +TOTAL : 0.321234 sec INFO: No Floating Point Exceptions have been reported - 850,868,776 cycles # 2.668 GHz - 1,892,927,301 instructions # 2.22 insn per cycle - 0.319517780 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) + 866,319,085 cycles # 2.667 GHz + 1,921,313,606 instructions # 2.22 insn per cycle + 0.325461095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3554) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.005606e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.810836e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.810836e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 
5.908411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.688910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.688910e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293957 sec +TOTAL : 0.298501 sec INFO: No Floating Point Exceptions have been reported - 798,554,126 cycles # 2.684 GHz - 1,814,787,943 instructions # 2.27 insn per cycle - 0.298209331 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) + 810,564,237 cycles # 2.685 GHz + 1,837,469,078 instructions # 2.27 insn per cycle + 0.302652949 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3378) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.328007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.744710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.744710e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.339310e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.756992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.756992e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.402038 sec +TOTAL : 0.400857 sec INFO: No Floating Point Exceptions have been reported - 736,423,468 cycles # 1.816 GHz - 1,301,837,346 instructions # 1.77 insn per cycle - 0.406202485 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1936) (512y: 32) (512z: 2382) + 737,225,912 cycles # 1.823 GHz + 1,313,545,461 instructions # 1.78 insn per cycle + 0.404928677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2435) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 0c9965805b..7de221c727 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:00:16 +DATE: 2024-06-03_18:07:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.769571e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.350019e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.722049e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.438101e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.278946e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.705921e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.450882 sec +TOTAL : 0.458495 sec INFO: No Floating Point Exceptions have been reported - 1,880,734,206 cycles # 2.821 GHz - 2,660,717,871 instructions # 1.41 insn per cycle - 0.723822735 seconds time elapsed + 1,899,751,176 cycles # 2.820 GHz + 2,680,130,955 instructions # 1.41 insn per cycle + 0.732599330 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
3.238115e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.167791e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.574879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.270088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.149686e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.558133e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.536771 sec +TOTAL : 0.538291 sec INFO: No Floating Point Exceptions have been reported - 2,185,532,891 cycles # 2.814 GHz - 3,100,266,437 instructions # 1.42 insn per cycle - 0.833787821 seconds time elapsed + 2,196,213,159 cycles # 2.822 GHz + 3,128,470,327 instructions # 1.42 insn per cycle + 0.835056301 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.000819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.021896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.021896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.022751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044788e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.044788e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.660472 sec +TOTAL : 1.625328 sec INFO: No Floating Point Exceptions have been reported - 4,752,171,523 cycles # 2.856 GHz - 13,466,883,992 instructions # 2.83 insn per cycle - 1.664681738 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) + 4,655,424,429 cycles # 2.864 GHz + 13,186,683,602 instructions # 2.83 insn per cycle + 1.629533606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 694) 
(avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.835752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905918e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905918e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893791e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915273 sec +TOTAL : 0.920546 sec INFO: No Floating Point Exceptions have been reported - 2,607,019,310 cycles # 2.837 GHz - 7,384,430,613 instructions # 2.83 insn per cycle - 0.919590344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) + 2,650,850,704 cycles # 2.868 GHz + 7,482,411,808 instructions # 2.82 insn per cycle + 0.924860801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3164) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.096035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.299954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.299954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.105613e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.308591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.308591e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.552392 sec +TOTAL : 0.550662 sec INFO: No Floating Point Exceptions have been reported - 1,470,210,795 cycles # 2.644 GHz - 3,054,979,092 instructions # 2.08 insn per cycle - 0.556662961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) + 1,483,239,290 cycles # 2.676 GHz + 3,134,997,530 instructions # 2.11 insn per cycle + 0.554907910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3137) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.519152e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.779200e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.779200e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.488353e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.745095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.488880 sec +TOTAL : 0.492475 sec INFO: No Floating Point Exceptions have been reported - 1,309,516,900 cycles # 2.658 GHz - 2,929,953,488 instructions # 2.24 insn per cycle - 0.493142435 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) + 1,327,041,740 cycles # 2.675 GHz + 2,988,516,661 instructions # 2.25 insn per cycle + 0.496736378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2905) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.279394e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.279394e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.181984e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.280563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.280563e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.776228 sec +TOTAL : 0.776078 sec INFO: No Floating Point Exceptions have been reported - 1,367,728,530 cycles # 1.754 GHz - 1,969,246,999 instructions # 1.44 insn per cycle - 0.780474856 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) + 1,372,083,988 cycles # 1.760 GHz + 1,997,444,155 instructions # 1.46 insn per cycle + 0.780337995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1737) (512y: 114) (512z: 2251) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 9ad9b977c8..04dcf15a7d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:00:30 +DATE: 2024-06-03_18:07:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.725668e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.143482e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487281e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.392663e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089452e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.491656e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455675 sec +TOTAL : 0.456745 sec INFO: No Floating Point Exceptions have been reported - 1,885,065,360 cycles # 2.812 GHz - 2,651,104,600 instructions # 1.41 insn per cycle - 0.728600735 seconds time elapsed + 1,899,216,031 cycles # 2.821 GHz + 2,684,077,764 instructions # 1.41 insn per cycle + 0.731359464 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 
3.168974e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.005384e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.392584e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.246268e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016380e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.406292e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537275 sec +TOTAL : 0.539355 sec INFO: No Floating Point Exceptions have been reported - 2,188,989,186 cycles # 2.824 GHz - 3,146,111,555 instructions # 1.44 insn per cycle - 0.833354304 seconds time elapsed + 2,201,245,823 cycles # 2.828 GHz + 3,151,089,400 instructions # 1.43 insn per cycle + 0.836766488 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.007306e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.028735e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.028735e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050829e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.649832 sec +TOTAL : 1.616534 sec INFO: No Floating Point Exceptions have been reported - 4,736,304,640 cycles # 2.865 GHz - 13,451,261,336 instructions # 2.84 insn per cycle - 1.654076225 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) + 4,647,619,159 cycles # 2.869 GHz + 13,174,118,623 instructions # 2.83 insn per cycle + 1.620696566 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) 
(avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.856269e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.927884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811251e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.905701 sec +TOTAL : 0.927238 sec INFO: No Floating Point Exceptions have been reported - 2,609,412,517 cycles # 2.870 GHz - 7,388,220,177 instructions # 2.83 insn per cycle - 0.909956215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) + 2,650,771,569 cycles # 2.848 GHz + 7,484,677,618 instructions # 2.82 insn per cycle + 0.931497652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.132823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.336633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336633e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.110568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.314066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.314066e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545981 sec +TOTAL : 0.549601 sec INFO: No Floating Point Exceptions have been reported - 1,469,511,109 cycles # 2.674 GHz - 3,055,566,040 instructions # 2.08 insn per cycle - 0.550174078 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) + 1,478,789,605 cycles # 2.673 GHz + 3,134,923,716 instructions # 2.12 insn per cycle + 0.553915068 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.536658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.797118e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797118e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.471635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.726223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.486288 sec +TOTAL : 0.494844 sec INFO: No Floating Point Exceptions have been reported - 1,307,118,138 cycles # 2.675 GHz - 2,931,084,096 instructions # 2.24 insn per cycle - 0.490604178 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) + 1,330,853,141 cycles # 2.670 GHz + 2,989,219,775 instructions # 2.25 insn per cycle + 0.499076847 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2881) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.194909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292767e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.171319e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.270839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.270839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.771280 sec +TOTAL : 0.779784 sec INFO: No Floating Point Exceptions have been reported - 1,369,689,187 cycles # 1.768 GHz - 1,969,498,668 instructions # 1.44 insn per cycle - 0.775554324 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) + 1,374,242,106 cycles # 1.754 GHz + 1,997,349,171 instructions # 1.45 insn per cycle + 0.784044628 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 114) (512z: 2251) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 529929a5c3..d754778efa 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:05:44 +DATE: 2024-06-03_18:44:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.419887e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.089577e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.185165e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.328854e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.085787e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186541e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.533774 sec +TOTAL : 0.531742 sec INFO: No Floating Point Exceptions have been reported - 2,165,329,100 cycles # 2.817 GHz - 3,116,509,991 instructions # 1.44 insn per cycle - 0.826546475 seconds time elapsed + 2,178,901,628 cycles # 2.821 GHz + 3,143,409,478 instructions # 1.44 insn per cycle + 0.829473381 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865718e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.915718e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.915718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.580845e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.617291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.617291e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 5.734321 sec +TOTAL : 6.748885 sec INFO: No Floating Point Exceptions have been reported - 16,442,910,174 cycles # 2.865 GHz - 42,483,732,959 instructions # 2.58 insn per cycle - 5.739954746 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 711) (avx2: 0) (512y: 0) (512z: 0) + 19,295,144,002 cycles # 2.857 GHz + 51,955,343,402 instructions # 2.69 insn per cycle + 6.754523888 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.238803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.814420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.940644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.940644e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.350989 sec +TOTAL : 3.841500 sec INFO: No Floating Point Exceptions have been reported - 9,605,090,400 cycles # 2.862 GHz - 26,316,930,760 instructions # 2.74 insn per cycle - 3.356479084 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2388) (avx2: 0) (512y: 0) (512z: 0) + 10,991,556,497 cycles # 2.858 GHz + 30,794,493,414 instructions # 2.80 insn per cycle + 3.846988521 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2929) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926105804 -Relative difference = 2.103617270732513e-07 +Avg ME (F77/C++) = 4.3134710926105795 +Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.211332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.644021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.644021e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.529737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.853643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.853643e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.125658 sec +TOTAL : 2.429427 sec INFO: No Floating Point Exceptions have been reported - 5,695,554,831 cycles # 2.674 GHz - 12,026,163,349 instructions # 2.11 insn per cycle - 2.131034660 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2532) (512y: 0) (512z: 0) + 6,484,403,568 cycles # 2.664 GHz + 13,670,821,767 instructions # 2.11 insn per cycle + 2.434855691 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2952) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.667087e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.177658e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.177658e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.971751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.359885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.359885e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 1.963849 sec +TOTAL : 2.222913 sec INFO: No Floating Point Exceptions have been reported - 5,196,538,426 cycles # 2.640 GHz - 11,156,532,822 instructions # 2.15 insn per cycle - 1.969477022 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2195) (512y: 148) (512z: 0) + 5,945,532,829 cycles # 2.669 GHz + 13,012,290,212 instructions # 2.19 insn per cycle + 2.228300354 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2684) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.473166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.654961e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.654961e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.316769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.483180e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483180e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.132981 sec +TOTAL : 3.275285 sec INFO: No Floating Point Exceptions have been reported - 5,562,641,747 cycles # 1.773 GHz - 8,071,126,847 instructions # 1.45 insn per cycle - 3.138593943 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1471) (512y: 129) (512z: 1684) + 5,818,933,451 cycles # 1.775 GHz + 8,593,198,148 instructions # 1.48 insn per cycle + 3.280653540 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1518) (512y: 128) (512z: 1942) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index 50bff49e4f..d9a36dcf38 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:06:09 +DATE: 2024-06-03_18:45:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.425736e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093257e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.189942e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.309379e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.084437e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188085e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.532512 sec +TOTAL : 0.534384 sec INFO: No Floating Point Exceptions have been reported - 2,171,060,205 cycles # 2.823 GHz - 3,109,256,727 instructions # 1.43 insn per cycle - 0.825854836 seconds time elapsed + 2,166,481,115 cycles # 2.819 GHz + 3,106,540,627 instructions # 1.43 insn per cycle + 0.827144727 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.886140e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.937260e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.668736e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.708844e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.708844e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 5.672763 sec +TOTAL : 6.397768 sec INFO: No Floating Point Exceptions have been reported - 16,265,601,075 cycles # 2.865 GHz - 43,265,334,700 instructions # 2.66 insn per cycle - 5.678175360 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 662) (avx2: 0) (512y: 0) (512z: 0) + 18,446,624,473 cycles # 2.881 GHz + 50,082,150,196 instructions # 2.71 insn per cycle + 6.403499792 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 639) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.297388e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.470079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.470079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.001026e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.144373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.144373e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.294375 sec +TOTAL : 3.608611 sec INFO: No Floating Point Exceptions have been reported - 9,446,450,203 cycles # 2.864 GHz - 25,429,379,126 instructions # 2.69 insn per cycle - 3.299817979 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2268) (avx2: 0) (512y: 0) (512z: 0) + 10,359,715,515 cycles # 2.867 GHz + 29,167,539,829 instructions # 2.82 insn per cycle + 3.614119568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926105804 -Relative difference = 2.103617270732513e-07 +Avg ME (F77/C++) = 4.3134710926105795 +Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.653725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.997172e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.997172e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.233371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.513596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.513596e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.366874 sec +TOTAL : 2.591194 sec INFO: No Floating Point Exceptions have been reported - 6,282,545,209 cycles # 2.649 GHz - 13,637,137,621 instructions # 2.17 insn per cycle - 2.372362603 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2629) (512y: 0) (512z: 0) + 6,943,168,004 cycles # 2.675 GHz + 15,152,528,398 instructions # 2.18 insn per cycle + 2.596694726 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3032) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.883827e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.255659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.255659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.379993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.676958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.676958e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.260406 sec +TOTAL : 2.507726 sec INFO: No Floating Point Exceptions have been reported - 6,053,972,710 cycles # 2.673 GHz - 12,722,135,295 instructions # 2.10 insn per cycle - 2.265888998 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 296) (512z: 0) + 6,714,039,983 cycles # 2.673 GHz + 14,625,463,624 instructions # 2.18 insn per cycle + 2.513162943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2634) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.425130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.601710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.601710e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.180932e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333512e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.176107 sec +TOTAL : 3.410152 sec INFO: No Floating Point Exceptions have been reported - 5,633,600,912 cycles # 1.772 GHz - 8,927,465,538 instructions # 1.58 insn per cycle - 3.181704052 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1357) (512y: 171) (512z: 1777) + 6,047,366,940 cycles # 1.771 GHz + 10,343,477,286 instructions # 1.71 insn per cycle + 3.415537139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1280) (512y: 214) (512z: 2129) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 2f0a202d23..9958b76d33 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:06:34 +DATE: 2024-06-03_18:45:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.564465e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483919e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.773757e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.537240e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970306e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166568e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.486107 sec +TOTAL : 0.488119 sec INFO: No Floating Point Exceptions have been reported - 2,007,999,810 cycles # 2.821 GHz - 2,899,840,704 instructions # 1.44 insn per cycle - 0.768580858 seconds time elapsed + 2,010,457,548 cycles # 2.816 GHz + 2,901,449,651 instructions # 1.44 insn per cycle + 0.771150125 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 157 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695463908836 -Relative difference = 4.162439020000051e-05 +Avg ME (F77/GPU) = 4.3136695491848513 +Relative difference = 4.162503792787837e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.944128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.000566e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.000566e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.637958e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.678542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.678542e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 5.484463 sec +TOTAL : 6.492871 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 15,737,215,094 cycles # 2.868 GHz - 42,223,129,627 instructions # 2.68 insn per cycle - 5.489744872 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 601) (avx2: 0) (512y: 0) (512z: 0) + 18,609,085,451 cycles # 2.864 GHz + 51,236,813,347 instructions # 2.75 insn per cycle + 6.497972416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 638) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135739049175754 -Relative difference = 2.2042608890083832e-08 +Avg ME (F77/C++) = 4.3135738277342170 +Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.521211e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.866032e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.866032e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.908557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.163450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.163450e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.410232 sec +TOTAL : 2.773748 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,949,798,990 cycles # 2.877 GHz - 16,918,935,545 instructions # 2.43 insn per cycle - 2.415585542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2983) (avx2: 0) (512y: 0) (512z: 0) + 7,945,872,155 cycles # 2.860 GHz + 19,321,820,400 instructions # 2.43 insn per cycle + 
2.778949610 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -130,8 +130,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313572e+00 -Avg ME (F77/C++) = 4.3135722205042839 -Relative difference = 5.111872113533787e-08 +Avg ME (F77/C++) = 4.3135722697479650 +Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -141,27 +141,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.866213e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.881041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.881041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.601030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.562195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.562195e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.422293 sec +TOTAL : 1.469342 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,860,334,790 cycles # 2.706 GHz - 7,989,354,890 instructions # 2.07 insn per cycle - 1.427639931 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) 
(512z: 0) + 3,953,006,654 cycles # 2.682 GHz + 8,836,458,479 instructions # 2.24 insn per cycle + 1.474496107 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3719) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645699221641 -Relative difference = 9.97035713074993e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -171,27 +169,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.312467e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.446145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.446145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.080394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.190045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.190045e+05 ) sec^-1 
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.350232 sec +TOTAL : 1.388542 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,671,588,520 cycles # 2.710 GHz - 7,492,175,118 instructions # 2.04 insn per cycle - 1.355454952 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3036) (512y: 23) (512z: 0) + 3,738,146,624 cycles # 2.684 GHz + 8,439,003,248 instructions # 2.26 insn per cycle + 1.393671238 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3555) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645699221641 -Relative difference = 9.97035713074993e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -201,16 +197,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) 
[cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.291103e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.907078e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.907078e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.739991e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.256621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.256621e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.755277 sec +TOTAL : 1.918383 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,329,330,756 cycles # 1.892 GHz - 5,989,173,339 instructions # 1.80 insn per cycle - 1.760500099 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2418) (512y: 32) (512z: 2031) + 3,506,051,293 cycles # 1.824 GHz + 6,249,171,634 instructions # 1.78 insn per cycle + 1.923680281 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 32) (512z: 2288) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -220,8 +216,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643783025444 -Relative difference = 8.770069111236825e-08 +Avg ME (F77/C++) = 4.3135643536224961 +Relative difference = 8.197919301304478e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 947a9772a4..1171fdfffd 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:06:54 +DATE: 2024-06-03_18:45:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.661514e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.491247e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780216e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.154335e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.008994e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197996e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.485941 sec +TOTAL : 0.491386 sec INFO: No Floating Point Exceptions have been reported - 2,010,449,048 cycles # 2.824 GHz - 2,892,288,272 instructions # 1.44 insn per cycle - 0.769427686 seconds time elapsed + 2,017,077,998 cycles # 2.815 GHz + 2,893,778,786 instructions # 1.43 insn per cycle + 0.775392827 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 131 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695463908836 -Relative difference = 4.162439020000051e-05 +Avg ME (F77/GPU) = 4.3136695491848513 +Relative difference = 4.162503792787837e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.994628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054364e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054364e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.692645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.735566e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.735566e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 5.347389 sec +TOTAL : 6.286011 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 15,338,140,112 cycles # 2.867 GHz - 42,471,920,214 instructions # 2.77 insn per cycle - 5.352646805 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 559) (avx2: 0) (512y: 0) (512z: 0) + 18,009,509,756 cycles # 2.863 GHz + 49,623,104,208 instructions # 2.76 insn per cycle + 6.291263696 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135739491553977 -Relative difference = 1.1787117204016727e-08 +Avg ME (F77/C++) = 4.3135738277342170 +Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.117220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.566866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.566866e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.394813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.720777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.720777e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.140174 sec +TOTAL : 2.477788 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,135,855,566 cycles # 2.861 GHz - 16,262,350,066 instructions # 2.65 insn per cycle - 2.145522102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2702) (avx2: 0) (512y: 0) (512z: 0) + 7,111,768,090 cycles # 2.865 GHz + 18,489,144,559 instructions # 2.60 insn per cycle + 
2.483157584 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3247) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -130,8 +130,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313572e+00 -Avg ME (F77/C++) = 4.3135722205042839 -Relative difference = 5.111872113533787e-08 +Avg ME (F77/C++) = 4.3135722697479650 +Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -141,16 +141,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.475476e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.144559e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.144559e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.190401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.626712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.626712e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.709024 sec +TOTAL : 2.111082 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,596,639,303 cycles # 2.683 GHz - 9,041,859,622 instructions # 1.97 insn per cycle - 1.714357652 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3558) (512y: 0) 
(512z: 0) + 5,653,716,321 cycles # 2.672 GHz + 10,852,271,306 instructions # 1.92 insn per cycle + 2.116527759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4278) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -160,8 +160,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645687580109 -Relative difference = 9.997345323075056e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -171,16 +171,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.643193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.350005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.350005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.201622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641929e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641929e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.667959 sec +TOTAL : 2.105368 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,476,334,932 cycles # 2.676 GHz - 8,532,641,638 instructions # 1.91 insn per cycle - 
1.673325231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3311) (512y: 10) (512z: 0) + 5,581,069,156 cycles # 2.645 GHz + 10,551,499,792 instructions # 1.89 insn per cycle + 2.110664204 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4147) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -190,8 +190,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645687580109 -Relative difference = 9.997345323075056e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -201,16 +201,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.116930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.706081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.706081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.153868e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.455195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.455195e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.804265 sec +TOTAL : 2.614812 sec INFO: The following Floating Point Exceptions have been reported: 
FE_UNDERFLOW - 3,310,050,790 cycles # 1.830 GHz - 5,957,409,151 instructions # 1.80 insn per cycle - 1.809617116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2357) (512y: 32) (512z: 2014) + 4,680,931,815 cycles # 1.787 GHz + 8,665,037,174 instructions # 1.85 insn per cycle + 2.620154476 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2929) (512y: 8) (512z: 2883) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -220,8 +220,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643783025444 -Relative difference = 8.770069111236825e-08 +Avg ME (F77/C++) = 4.3135643536224961 +Relative difference = 8.197919301304478e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 4f4847b6b6..85c03177a5 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:07:15 +DATE: 2024-06-03_18:46:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.488531e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088944e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184171e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.264249e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082334e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184481e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.530669 sec +TOTAL : 0.532072 sec INFO: No Floating Point Exceptions have been reported - 2,163,522,979 cycles # 2.824 GHz - 3,135,447,150 instructions # 1.45 insn per cycle - 0.823757872 seconds time elapsed + 2,180,273,787 cycles # 2.817 GHz + 3,157,278,745 instructions # 1.45 insn per cycle + 0.830334061 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.700696e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.742660e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.742660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.527281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.527281e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.280235 sec +TOTAL : 7.127701 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 17,574,906,678 cycles # 2.797 GHz - 41,764,388,623 instructions # 2.38 insn per cycle - 6.285714574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) + 20,457,240,037 cycles # 2.868 GHz + 51,952,278,015 instructions # 2.54 insn per cycle + 7.133276776 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -111,16 +111,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.952464e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.091303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.091303e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.663839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.776043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.776043e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.665315 sec +TOTAL : 4.050680 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 10,271,038,566 cycles # 2.799 GHz - 26,354,751,502 instructions # 2.57 insn per cycle - 3.670825968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2438) (avx2: 0) (512y: 0) (512z: 0) + 11,522,261,452 cycles # 2.841 GHz + 30,595,506,119 instructions # 2.66 insn per cycle + 4.056343844 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2982) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -141,16 +141,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.531327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.854267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.854267e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.373632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.674233e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.428471 sec +TOTAL : 2.511484 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,507,055,363 cycles # 2.675 GHz - 12,119,284,734 instructions # 1.86 insn per cycle - 2.434107519 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2718) (512y: 0) (512z: 0) + 6,732,763,495 cycles # 2.676 GHz + 13,614,893,320 instructions # 2.02 insn per cycle + 2.516932126 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3124) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -171,16 +171,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.902513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.278233e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.278233e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.783746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.145465e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.145465e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.252469 sec +TOTAL : 2.305308 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,021,388,550 cycles # 2.667 GHz - 11,226,998,655 instructions # 1.86 insn per cycle - 2.257985842 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2369) (512y: 150) (512z: 0) + 6,178,690,068 cycles # 2.675 GHz + 12,983,785,016 instructions # 2.10 insn per cycle + 2.310871293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2863) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -201,16 +201,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.151456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.300586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.300586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.991726e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.126206e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.440903 sec +TOTAL : 3.618891 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,082,239,634 cycles # 1.765 GHz - 8,214,081,257 instructions # 1.35 insn per cycle - 3.446457125 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1787) (512y: 134) (512z: 1755) + 6,406,437,804 cycles # 1.768 GHz + 8,706,435,895 instructions # 1.36 insn per cycle + 3.624513725 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1815) (512y: 134) (512z: 2012) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index a2ade5f790..6045bdf498 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:07:42 +DATE: 2024-06-03_18:46:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.485563e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095251e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.191531e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.277612e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.084295e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186968e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.530779 sec +TOTAL : 0.535350 sec INFO: No Floating Point Exceptions have been reported - 2,189,344,982 cycles # 2.825 GHz - 3,162,111,956 instructions # 1.44 insn per cycle - 0.831473815 seconds time elapsed + 2,169,547,372 cycles # 2.814 GHz + 3,115,345,457 instructions # 1.44 insn per cycle + 0.828151696 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.752561e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796583e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.796583e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.568605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604083e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.097431 sec +TOTAL : 6.798758 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 17,482,218,633 cycles # 2.865 GHz - 43,049,154,317 instructions # 2.46 insn per cycle - 6.102997010 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) + 19,500,149,045 cycles # 2.866 GHz + 49,982,389,934 instructions # 2.56 insn per cycle + 6.804368339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -111,16 +111,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.172478e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332063e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332063e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.816408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.942934e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942934e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.418913 sec +TOTAL : 3.837058 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 9,801,825,304 cycles # 2.863 GHz - 25,166,361,997 instructions # 2.57 insn per cycle - 3.424631390 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2276) (avx2: 0) (512y: 0) (512z: 0) + 10,983,586,791 cycles # 2.859 GHz + 29,101,586,876 instructions # 2.65 insn per cycle + 3.842851713 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2818) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -141,16 +141,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.163570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.437386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.437386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.607773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808836e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.632138 sec +TOTAL : 3.019734 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,035,871,657 cycles # 2.669 GHz - 12,789,981,390 instructions # 1.82 insn per cycle - 2.637561799 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2699) (512y: 0) (512z: 0) + 8,052,880,399 cycles # 2.663 GHz + 15,178,369,602 instructions # 1.88 insn per cycle + 3.025353618 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3208) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -171,16 +171,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.487333e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.798535e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.798535e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.779624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.999859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.999859e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.449076 sec +TOTAL : 2.887642 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,545,009,203 cycles # 2.667 GHz - 12,105,117,349 instructions # 1.85 insn per cycle - 2.454490824 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2351) (512y: 227) (512z: 0) + 7,712,100,280 cycles # 2.667 GHz + 14,487,564,678 instructions # 1.88 insn per cycle + 2.893320746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2786) (512y: 304) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -201,16 +201,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.973614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.106585e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.106585e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.893138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.019391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019391e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.639191 sec +TOTAL : 3.738117 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,417,633,310 cycles # 1.761 GHz - 8,985,370,621 instructions # 1.40 insn per cycle - 3.644757809 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1892) (512y: 178) (512z: 2083) + 6,563,142,303 cycles # 1.754 GHz + 9,902,891,266 instructions # 1.51 insn per cycle + 3.743637878 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1588) (512y: 220) (512z: 2216) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 3909c2de90..9658756422 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:04:42 +DATE: 2024-06-03_18:43:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.208514e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.234770e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.239030e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.191492e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.216787e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.220642e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.465857 sec +TOTAL : 0.467466 sec INFO: No Floating Point Exceptions have been reported - 1,954,339,048 cycles # 2.816 GHz - 2,841,818,682 instructions # 1.45 insn per cycle - 0.751002948 seconds time elapsed + 1,933,277,234 cycles # 2.822 GHz + 2,826,212,304 instructions # 1.46 insn per cycle + 0.744269904 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.793700e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.959741e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.846498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.968500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.976840e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483528 sec +TOTAL : 0.480910 sec INFO: No Floating Point Exceptions have been reported - 1,990,538,703 cycles # 2.818 GHz - 2,859,145,317 instructions # 1.44 insn per cycle - 0.763660982 seconds time elapsed + 2,022,662,983 cycles # 2.822 GHz + 2,921,094,655 instructions # 1.44 insn per cycle + 0.772867570 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.333630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.336951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336951e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345591e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348861e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348861e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.163551 sec +TOTAL : 0.162979 sec INFO: No Floating Point Exceptions have been reported - 475,454,071 cycles # 2.848 GHz - 1,396,942,135 instructions # 2.94 insn per cycle - 0.167517734 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3991) (avx2: 0) (512y: 0) (512z: 0) + 475,099,311 cycles # 2.856 GHz + 1,396,895,904 instructions # 2.94 insn per cycle + 0.166930533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3921) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.379547e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.391026e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.391026e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.297415e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.309092e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.309092e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.087740 sec +TOTAL : 0.088761 sec INFO: No Floating Point Exceptions have been reported - 245,466,319 cycles # 2.692 GHz - 699,170,520 instructions # 2.85 insn per cycle - 0.091863831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9501) (avx2: 0) (512y: 0) (512z: 0) + 248,572,980 cycles # 2.693 GHz + 700,241,386 instructions # 2.82 insn per cycle + 0.092789911 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9495) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.397405e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.403101e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.403101e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.400019e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.405543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.405543e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042780 sec +TOTAL : 0.042745 sec INFO: No Floating Point Exceptions have been reported - 121,204,590 cycles # 2.623 GHz - 260,141,578 instructions # 2.15 insn per cycle - 0.046815365 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8227) (512y: 0) (512z: 0) + 121,819,291 cycles # 2.642 GHz + 265,166,117 instructions # 2.18 insn per cycle + 0.046685672 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8514) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.586602e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.593859e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.593859e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572077e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.579933e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.579933e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038358 sec +TOTAL : 0.038595 sec INFO: No Floating Point Exceptions have been reported - 108,920,181 cycles # 2.610 GHz - 240,176,540 instructions # 2.21 insn per cycle - 0.042378717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7348) (512y: 150) (512z: 0) + 110,467,988 cycles # 2.628 GHz + 247,221,317 instructions # 2.24 insn per cycle + 0.042621267 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8157) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.175182e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180276e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180276e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.155645e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160467e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.050174 sec +TOTAL : 0.050919 sec INFO: No Floating Point Exceptions have been reported - 97,067,712 cycles # 1.812 GHz - 138,415,288 instructions # 1.43 insn per cycle - 0.054229752 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1692) (512y: 126) (512z: 6592) + 98,056,565 cycles # 1.804 GHz + 141,486,289 instructions # 1.44 insn per cycle + 0.054927179 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1955) (512y: 126) (512z: 7089) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index 65eb2e6009..64cee9c3b3 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:04:52 +DATE: 2024-06-03_18:43:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.239779e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.264296e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.268375e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.233887e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.259221e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.263483e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.465734 sec +TOTAL : 0.465295 sec INFO: No Floating Point Exceptions have been reported - 1,944,767,230 cycles # 2.813 GHz - 2,830,393,799 instructions # 1.46 insn per cycle - 0.748614808 seconds time elapsed + 1,958,806,784 cycles # 2.817 GHz + 2,840,787,434 instructions # 1.45 insn per cycle + 0.751914831 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.968348e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.116174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.125970e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.956639e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.091509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.101091e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483276 sec +TOTAL : 0.486580 sec INFO: No Floating Point Exceptions have been reported - 2,014,046,726 cycles # 2.816 GHz - 2,960,547,521 instructions # 1.47 insn per cycle - 0.771364825 seconds time elapsed + 1,989,010,639 cycles # 2.812 GHz + 2,958,637,747 instructions # 1.49 insn per cycle + 0.766005331 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.352099e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.355611e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.355611e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.332669e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.336081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.336081e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.161935 sec +TOTAL : 0.163052 sec INFO: No Floating Point Exceptions have been reported - 471,513,746 cycles # 2.852 GHz - 1,391,998,687 instructions # 2.95 insn per cycle - 0.165915651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3869) (avx2: 0) (512y: 0) (512z: 0) + 472,941,785 cycles # 2.842 GHz + 1,392,300,118 instructions # 2.94 insn per cycle + 0.167088631 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.340411e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.352264e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.352264e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.188233e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.199153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.199153e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.087524 sec +TOTAL : 0.089766 sec INFO: No Floating Point Exceptions have been reported - 244,692,087 cycles # 2.689 GHz - 695,265,791 instructions # 2.84 insn per cycle - 0.091623255 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9537) (avx2: 0) (512y: 0) (512z: 0) + 248,021,944 cycles # 2.658 GHz + 696,332,728 instructions # 2.81 insn per cycle + 0.093962685 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9540) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.384610e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.390602e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.390602e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.381818e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387327e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387327e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042428 sec +TOTAL : 0.042683 sec INFO: No Floating Point Exceptions have been reported - 120,402,247 cycles # 2.621 GHz - 255,771,436 instructions # 2.12 insn per cycle - 0.046482789 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8181) (512y: 0) (512z: 0) + 120,036,632 cycles # 2.612 GHz + 260,692,299 instructions # 2.17 insn per cycle + 0.046713736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8469) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.577119e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.592040e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.592040e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.560836e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.567795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567795e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037809 sec +TOTAL : 0.038118 sec INFO: No Floating Point Exceptions have been reported - 106,943,960 cycles # 2.590 GHz - 235,812,455 instructions # 2.21 insn per cycle - 0.041801607 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7301) (512y: 150) (512z: 0) + 108,246,886 cycles # 2.601 GHz + 242,814,438 instructions # 2.24 insn per cycle + 0.042161703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8115) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052569e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.057306e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057306e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.157650e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162537e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.162537e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.054678 sec +TOTAL : 0.050130 sec INFO: No Floating Point Exceptions have been reported - 95,978,810 cycles # 1.760 GHz - 134,249,554 instructions # 1.40 insn per cycle - 0.058819108 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1641) (512y: 126) (512z: 6597) + 95,930,950 cycles # 1.791 GHz + 136,895,076 instructions # 1.43 insn per cycle + 0.054115966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1911) (512y: 126) (512z: 7093) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index a147c96b16..f6523a4ed4 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:03 +DATE: 2024-06-03_18:43:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.545160e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557368e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.560350e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 -TOTAL : 0.469580 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.441793e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.452389e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.454845e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 +TOTAL : 0.469243 sec INFO: No Floating Point Exceptions have been reported - 1,952,036,300 cycles # 2.812 GHz - 2,839,447,182 instructions # 1.45 insn per cycle - 0.751257373 seconds time elapsed + 1,955,791,300 cycles # 2.819 GHz + 2,846,567,055 instructions # 1.46 insn per cycle + 0.750731805 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow 
summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.618765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.738189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.751774e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020493e-03 +- 4.025604e-03 ) GeV^-4 -TOTAL : 0.470396 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.093405e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.186877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.197998e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.020494e-03 +- 4.025605e-03 ) GeV^-4 +TOTAL : 0.468927 sec INFO: No Floating Point Exceptions have been reported - 1,933,532,708 cycles # 2.817 GHz - 2,825,179,750 instructions # 1.46 insn per cycle - 0.744816373 seconds time elapsed + 1,947,947,883 cycles # 2.819 GHz + 2,845,586,918 instructions # 1.46 insn per cycle + 0.747736312 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272870954487585E-006 -Relative difference = 4.564329725014175e-06 +Avg ME (F77/GPU) = 8.1272869669930272E-006 +Relative difference = 4.548524165778887e-06 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.450156e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.453792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.365791e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.369019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.369019e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.158288 sec +TOTAL : 0.162125 sec INFO: No Floating Point Exceptions have been reported - 461,148,667 cycles # 2.852 GHz - 1,393,475,309 instructions # 3.02 insn per cycle - 0.162260134 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3070) (avx2: 0) (512y: 0) (512z: 0) + 472,335,702 cycles # 2.852 GHz + 1,389,145,792 instructions # 2.94 insn per cycle + 0.166189548 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3071) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105211728276E-006 -Relative difference = 5.891219330978222e-08 +Avg ME (F77/C++) = 8.1278105271212486E-006 +Relative difference = 5.8180333155894157e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.199587e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.203982e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.203982e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.177410e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181512e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181512e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.048879 sec +TOTAL : 0.049884 sec INFO: No Floating Point Exceptions have been reported - 138,617,500 cycles # 2.649 GHz - 375,838,324 instructions # 2.71 insn per cycle - 0.052819431 seconds time elapsed -=Symbols in CPPProcess_cpp.o= 
(~sse4:10134) (avx2: 0) (512y: 0) (512z: 0) + 140,700,616 cycles # 2.641 GHz + 379,285,257 instructions # 2.70 insn per cycle + 0.053843733 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:10152) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.585276e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.607346e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.607346e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.685774e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.707905e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.707905e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.025535 sec +TOTAL : 0.024610 sec INFO: No Floating Point Exceptions have been reported - 73,091,500 cycles # 2.523 GHz - 146,753,019 instructions # 2.01 insn per cycle - 0.029504478 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8933) (512y: 0) (512z: 0) + 73,040,255 cycles # 2.606 GHz + 149,958,625 instructions # 2.05 insn per cycle + 0.028633410 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9255) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.061402e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.092590e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.092590e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.962518e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.988659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.988659e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022191 sec +TOTAL : 0.022813 sec INFO: No Floating Point Exceptions have been reported - 67,057,729 cycles # 2.606 GHz - 136,530,201 instructions # 2.04 insn per cycle - 0.026240821 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8164) (512y: 28) (512z: 0) + 67,549,740 cycles # 2.579 GHz + 139,989,278 instructions # 2.07 insn per cycle + 0.026775785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8975) (512y: 28) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.320153e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.340741e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.340741e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278588e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.299871e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.299871e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.027976 sec +TOTAL : 0.028313 sec INFO: No Floating Point Exceptions have been reported - 59,523,378 cycles # 1.900 GHz - 85,246,359 instructions # 1.43 insn per cycle - 0.031991723 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2572) (512y: 32) (512z: 6935) + 60,108,907 cycles # 1.896 GHz + 86,712,066 instructions # 1.44 insn per cycle + 0.032360710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2837) (512y: 32) (512z: 7440) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 6d3597262c..5ad5bc88ac 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:13 +DATE: 2024-06-03_18:44:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.556636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.568185e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.572444e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 -TOTAL : 0.472720 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.478245e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.489517e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.493671e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 +TOTAL : 0.468633 sec INFO: No Floating Point Exceptions have been reported - 1,931,223,021 cycles # 2.806 GHz - 2,809,227,604 instructions # 1.45 insn per cycle - 0.747270154 seconds time elapsed + 1,959,695,231 cycles # 2.816 GHz + 2,840,090,818 instructions # 1.45 insn per cycle + 0.752893998 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow 
summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.951614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.008354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.009763e+06 ) sec^-1 -MeanMatrixElemValue = ( 8.020495e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.469854 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.318325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.411516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.422873e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 +TOTAL : 0.471146 sec INFO: No Floating Point Exceptions have been reported - 1,934,807,792 cycles # 2.816 GHz - 2,823,952,331 instructions # 1.46 insn per cycle - 0.744383978 seconds time elapsed + 1,938,806,864 cycles # 2.813 GHz + 2,859,507,249 instructions # 1.47 insn per cycle + 0.745707568 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272870252982758E-006 -Relative difference = 4.555698209723637e-06 +Avg ME (F77/GPU) = 8.1272866419447706E-006 +Relative difference = 4.508529302013153e-06 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440467e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.444153e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.444153e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.357321e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.360536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.360536e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.157850 sec +TOTAL : 0.161848 sec INFO: No Floating Point Exceptions have been reported - 459,198,020 cycles # 2.845 GHz - 1,388,550,014 instructions # 3.02 insn per cycle - 0.161959627 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2959) (avx2: 0) (512y: 0) (512z: 0) + 470,663,785 cycles # 2.849 GHz + 1,384,011,638 instructions # 2.94 insn per cycle + 0.165810869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2943) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105211728276E-006 -Relative difference = 5.891219330978222e-08 +Avg ME (F77/C++) = 8.1278105271212486E-006 +Relative difference = 5.8180333155894157e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.193461e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198367e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198367e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178652e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.178652e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.048387 sec +TOTAL : 0.049176 sec INFO: No Floating Point Exceptions have been reported - 136,709,201 cycles # 2.638 GHz - 370,998,148 instructions # 2.71 insn per cycle - 0.052404685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= 
(~sse4:10117) (avx2: 0) (512y: 0) (512z: 0) + 138,396,950 cycles # 2.632 GHz + 374,468,590 instructions # 2.71 insn per cycle + 0.053090230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:10135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.706186e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.728329e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.728329e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.706805e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.730168e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730168e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.023794 sec +TOTAL : 0.023755 sec INFO: No Floating Point Exceptions have been reported - 70,529,419 cycles # 2.593 GHz - 141,874,277 instructions # 2.01 insn per cycle - 0.027836206 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8887) (512y: 0) (512z: 0) + 70,854,023 cycles # 2.612 GHz + 145,181,313 instructions # 2.05 insn per cycle + 0.027711456 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9209) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.065896e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.094176e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.094176e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.969928e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.998045e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.998045e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022167 sec +TOTAL : 0.022034 sec INFO: No Floating Point Exceptions have been reported - 65,090,187 cycles # 2.575 GHz - 131,753,137 instructions # 2.02 insn per cycle - 0.026221124 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8117) (512y: 28) (512z: 0) + 65,659,386 cycles # 2.581 GHz + 135,178,550 instructions # 2.06 insn per cycle + 0.026035849 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8931) (512y: 28) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.325267e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345484e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345484e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.281314e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.301558e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301558e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.027163 sec +TOTAL : 0.027495 sec INFO: No Floating Point Exceptions have been reported - 57,486,714 cycles # 1.879 GHz - 80,476,258 instructions # 1.40 insn per cycle - 0.031188983 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 32) (512z: 6939) + 57,968,608 cycles # 1.874 GHz + 82,072,103 instructions # 1.42 insn per cycle + 0.031472874 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2792) (512y: 32) (512z: 7442) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index f1bf8ae1ae..8c3296e4df 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:23 +DATE: 2024-06-03_18:44:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.186832e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.210139e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214000e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.182508e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.205667e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209569e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.466969 sec +TOTAL : 0.464547 sec INFO: No Floating Point Exceptions have been reported - 1,929,963,905 cycles # 2.810 GHz - 2,820,321,257 instructions # 1.46 insn per cycle - 0.744660265 seconds time elapsed + 1,952,246,686 cycles # 2.820 GHz + 2,850,005,557 instructions # 1.46 insn per cycle + 0.748916127 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP 
precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.774215e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.915971e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.925293e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.811825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.935505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.943802e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.481813 sec +TOTAL : 0.484992 sec INFO: No Floating Point Exceptions have been reported - 2,018,144,507 cycles # 2.824 GHz - 2,988,593,572 instructions # 1.48 insn per cycle - 0.771128476 seconds time elapsed + 2,003,054,759 cycles # 2.819 GHz + 2,980,865,994 instructions # 1.49 insn per cycle + 0.768589369 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.318424e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321576e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321576e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.311983e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.315138e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.315138e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.164181 sec +TOTAL : 0.164815 sec INFO: No Floating Point Exceptions have been reported - 478,545,974 cycles # 2.854 GHz - 1,405,298,148 instructions # 2.94 insn per cycle - 0.168196808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3977) (avx2: 0) (512y: 0) (512z: 0) + 480,558,988 cycles # 2.855 GHz + 1,405,529,926 instructions # 2.92 insn per cycle + 0.169036832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3912) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.576930e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588828e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588828e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.486300e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.498254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.498254e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.085353 sec +TOTAL : 0.086377 sec INFO: No Floating Point Exceptions have been reported - 242,856,261 cycles # 2.738 GHz - 691,007,271 instructions # 2.85 insn per cycle - 0.089392731 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9324) (avx2: 0) (512y: 0) (512z: 0) + 245,115,567 cycles # 2.731 GHz + 695,255,062 instructions # 2.84 insn per cycle + 0.090377006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9339) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.350712e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.355921e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.355921e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.312936e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.317784e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.317784e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.044021 sec +TOTAL : 0.045252 sec INFO: No Floating Point Exceptions have been reported - 120,750,481 cycles # 2.554 GHz - 257,896,528 instructions # 2.14 insn per cycle - 0.048182370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8244) (512y: 0) (512z: 0) + 121,783,811 cycles # 2.496 GHz + 260,306,932 instructions # 2.14 insn per cycle + 0.049283272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8369) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.596995e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.604840e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.604840e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.551630e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558637e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038244 sec +TOTAL : 0.039016 sec INFO: No Floating Point Exceptions have been reported - 108,668,422 cycles # 2.610 GHz - 238,349,934 instructions # 2.19 insn per cycle - 0.042362101 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7342) (512y: 146) (512z: 0) + 109,019,559 cycles # 2.565 GHz + 240,830,620 instructions # 2.21 insn per cycle + 0.043032343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7513) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.152016e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156769e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.156769e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.142604e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147315e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147315e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.051128 sec +TOTAL : 0.051566 sec INFO: No Floating Point Exceptions have been reported - 98,593,656 cycles # 1.806 GHz - 139,368,043 instructions # 1.41 insn per cycle - 0.055141464 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1953) (512y: 122) (512z: 6323) + 98,898,934 cycles # 1.798 GHz + 140,464,966 instructions # 1.42 insn per cycle + 0.055620860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2085) (512y: 122) (512z: 6355) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index 1674ae1a31..9498116dcd 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:34 +DATE: 2024-06-03_18:44:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.211002e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.235184e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.239290e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.212183e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.236266e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240018e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.468026 sec +TOTAL : 0.464032 sec INFO: No Floating Point Exceptions have been reported - 1,934,707,675 cycles # 2.807 GHz - 2,829,874,008 instructions # 1.46 insn per cycle - 0.746355889 seconds time elapsed + 1,950,674,091 cycles # 2.818 GHz + 2,829,648,398 instructions # 1.45 insn per cycle + 0.749030588 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP 
precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.925457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.068678e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.078821e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.950257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.072861e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081462e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.484376 sec +TOTAL : 0.483935 sec INFO: No Floating Point Exceptions have been reported - 1,990,471,362 cycles # 2.818 GHz - 2,966,958,009 instructions # 1.49 insn per cycle - 0.763465136 seconds time elapsed + 2,006,012,361 cycles # 2.809 GHz + 2,974,994,139 instructions # 1.48 insn per cycle + 0.771162547 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
@@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.320537e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.323936e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.323936e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.237548e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.240992e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240992e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.163600 sec +TOTAL : 0.167612 sec INFO: No Floating Point Exceptions have been reported - 475,927,096 cycles # 2.852 GHz - 1,400,684,973 instructions # 2.94 insn per cycle - 0.167595349 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3871) (avx2: 0) (512y: 0) (512z: 0) + 477,099,035 cycles # 2.792 GHz + 1,400,932,341 instructions # 2.94 insn per cycle + 0.171602717 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3813) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.590632e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.602811e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.602811e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.145549e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.157756e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.157756e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.084445 sec +TOTAL : 0.090540 sec INFO: No Floating Point Exceptions have been reported - 241,578,930 cycles # 2.751 GHz - 687,384,148 instructions # 2.85 insn per cycle - 0.088448470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9365) (avx2: 0) (512y: 0) (512z: 0) + 245,660,090 cycles # 2.600 GHz + 691,394,625 instructions # 2.81 insn per cycle + 0.095105056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9372) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.419553e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425604e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425604e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.392384e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.397823e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397823e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.041668 sec +TOTAL : 0.042222 sec INFO: No Floating Point Exceptions have been reported - 118,041,093 cycles # 2.620 GHz - 253,446,942 instructions # 2.15 insn per cycle - 0.045688456 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8196) (512y: 0) (512z: 0) + 119,541,895 cycles # 2.616 GHz + 255,861,520 instructions # 2.14 insn per cycle + 0.046263283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.610064e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.617322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.617322e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596967e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604589e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037145 sec +TOTAL : 0.037416 sec INFO: No Floating Point Exceptions have been reported - 106,387,352 cycles # 2.623 GHz - 233,796,871 instructions # 2.20 insn per cycle - 0.041248038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7292) (512y: 146) (512z: 0) + 106,767,617 cycles # 2.615 GHz + 236,442,043 instructions # 2.21 insn per cycle + 0.041453124 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7464) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
@@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.153901e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158908e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.158908e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.147136e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152341e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.050243 sec +TOTAL : 0.050589 sec INFO: No Floating Point Exceptions have been reported - 96,297,242 cycles # 1.793 GHz - 134,709,358 instructions # 1.40 insn per cycle - 0.054323733 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 122) (512z: 6323) + 96,886,279 cycles # 1.795 GHz + 135,816,090 instructions # 1.40 insn per cycle + 0.054606545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2036) (512y: 122) (512z: 6355) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index c46a8918fe..8e958dea3f 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:03:31 +DATE: 2024-06-03_18:42:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.623490e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.780511e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.419250e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.527999 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.162731e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.647642e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.389698e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.523677 sec INFO: No Floating Point Exceptions have been reported - 2,127,703,778 cycles # 2.804 GHz - 3,037,503,741 instructions # 1.43 insn per cycle - 0.819938887 seconds time elapsed + 2,134,696,018 cycles # 2.814 GHz + 3,031,565,143 instructions # 1.42 insn per cycle + 0.817462514 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314203117931 +Relative difference = 1.37574474202048e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.629121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112830e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.204706 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.860735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.012248e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.012248e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.299639 sec INFO: No Floating Point Exceptions have been reported - 3,459,527,100 cycles # 2.861 GHz - 8,713,936,767 instructions # 2.52 insn per cycle - 1.210090686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) + 3,734,089,788 cycles # 2.863 GHz + 9,713,794,615 instructions # 2.60 insn per cycle + 1.305110900 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 427) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.614799e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.138803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.138803e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.766288 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.453119e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.863695e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.863695e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.838764 sec INFO: No Floating Point Exceptions have been reported - 2,201,462,395 cycles # 2.855 GHz - 5,464,414,082 instructions # 2.48 insn per cycle - 
0.771678284 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1298) (avx2: 0) (512y: 0) (512z: 0) + 2,332,040,160 cycles # 2.765 GHz + 5,935,472,258 instructions # 2.55 insn per cycle + 0.844206086 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.243332e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350818e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350818e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.584870 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.167991e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.174056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174056e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.601519 sec INFO: No Floating Point Exceptions have been reported - 1,605,862,312 cycles # 2.723 GHz - 3,180,962,176 instructions # 1.98 insn per cycle - 0.590347744 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 0) (512z: 0) + 1,662,054,757 cycles # 2.741 GHz + 3,319,057,632 instructions # 2.00 insn per cycle + 0.607052491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1551) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 
2.328694e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.520640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520640e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.567215 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.223901e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.317323e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.317323e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.591038 sec INFO: No Floating Point Exceptions have been reported - 1,559,045,940 cycles # 2.726 GHz - 3,082,232,081 instructions # 1.98 insn per cycle - 0.572568359 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1274) (512y: 95) (512z: 0) + 1,627,445,040 cycles # 2.731 GHz + 3,290,175,146 instructions # 2.02 insn per cycle + 0.596727910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.106594e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.015389e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015389e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.614754 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.089949e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.989851e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989851e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.619709 sec INFO: No Floating Point Exceptions have been reported - 1,347,809,988 cycles # 2.176 GHz - 2,375,607,493 instructions # 1.76 insn per cycle - 0.620258555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 584) (512y: 62) (512z: 953) + 1,359,372,785 cycles # 2.176 GHz + 2,427,696,314 instructions # 1.79 insn per cycle + 0.625192962 seconds time elapsed +=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 598) (512y: 60) (512z: 1020) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index e5d6236670..a8c2ab23a4 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:03:43 +DATE: 2024-06-03_18:42:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.746443e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.322888e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.759689e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.521311 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.277953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.135668e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.738374e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.522097 sec INFO: No Floating Point Exceptions have been reported - 2,128,407,529 cycles # 2.814 GHz - 3,052,405,520 instructions # 1.43 insn per cycle - 0.813389325 seconds time elapsed + 2,135,028,596 cycles # 2.824 GHz + 3,021,285,386 instructions # 1.42 insn per cycle + 0.815068044 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.688675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.121548e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.121548e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.196984 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.902017e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017998e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017998e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.294184 sec INFO: No Floating Point Exceptions have been reported - 3,435,642,752 cycles # 2.859 GHz - 8,628,896,472 instructions # 2.51 insn per cycle - 1.202405155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 403) (avx2: 0) (512y: 0) (512z: 0) + 3,718,952,494 cycles # 2.864 GHz + 9,608,091,878 instructions # 2.58 insn per cycle + 1.299745147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.172160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.172160e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.755876 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.491839e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928644e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.819333 sec INFO: No Floating Point Exceptions have been reported - 2,176,531,869 cycles # 2.862 GHz - 5,398,906,105 instructions # 2.48 insn per cycle - 
0.761260827 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1258) (avx2: 0) (512y: 0) (512z: 0) + 2,348,682,704 cycles # 2.850 GHz + 5,882,248,118 instructions # 2.50 insn per cycle + 0.824898694 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.236743e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.324010e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.324010e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.584681 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.158538e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.187920e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.187920e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.603487 sec INFO: No Floating Point Exceptions have been reported - 1,593,673,714 cycles # 2.704 GHz - 3,147,296,381 instructions # 1.97 insn per cycle - 0.590069472 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1386) (512y: 0) (512z: 0) + 1,667,796,633 cycles # 2.742 GHz + 3,291,585,576 instructions # 1.97 insn per cycle + 0.609046146 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 
2.274934e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.428395e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.428395e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.577971 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.235804e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333150e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333150e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.584918 sec INFO: No Floating Point Exceptions have been reported - 1,554,970,499 cycles # 2.667 GHz - 3,061,298,117 instructions # 1.97 insn per cycle - 0.583755416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1220) (512y: 95) (512z: 0) + 1,615,178,094 cycles # 2.739 GHz + 3,267,070,231 instructions # 2.02 insn per cycle + 0.590494284 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1394) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.106748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.027961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.027961e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.613683 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.105460e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.013661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.013661e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.614015 sec INFO: No Floating Point Exceptions have been reported - 1,361,641,398 cycles # 2.201 GHz - 2,360,583,231 instructions # 1.73 insn per cycle - 0.619282503 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 557) (512y: 62) (512z: 944) + 1,365,598,832 cycles # 2.207 GHz + 2,412,888,019 instructions # 1.77 insn per cycle + 0.619396780 seconds time elapsed +=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 565) (512y: 60) (512z: 1006) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index e5e2512c5d..6d06ae0cff 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:03:56 +DATE: 2024-06-03_18:42:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.267825e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155988e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.256672e+09 ) sec^-1 -MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 -TOTAL : 0.482081 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.443384e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324007e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.721983e+09 ) sec^-1 +MeanMatrixElemValue = ( 4.221160e-01 +- 1.229724e-04 ) GeV^0 +TOTAL : 0.483044 sec INFO: No Floating Point Exceptions have been reported - 2,007,174,940 cycles # 2.814 GHz - 2,862,530,986 instructions # 1.43 insn per cycle - 0.770265614 seconds time elapsed + 1,993,113,759 cycles # 2.820 GHz + 2,854,035,752 instructions # 1.43 insn per cycle + 0.765257157 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 72 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 100 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232893e-01 -Avg ME (F77/GPU) = 0.42328959883889183 -Relative difference = 7.059920764700599e-07 +Avg ME (C++/GPU) = 4.213628e-01 +Avg ME (F77/GPU) = 0.42136313809896819 +Relative difference = 8.023939659863929e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.657361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.123963e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.123963e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.176818 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.983263e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033842e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033842e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 1.258899 sec INFO: No Floating Point Exceptions have been reported - 3,379,168,439 cycles # 2.861 GHz - 8,663,925,346 instructions # 2.56 insn per cycle - 1.182131834 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 464) (avx2: 0) (512y: 0) (512z: 0) + 3,639,115,885 cycles # 2.880 GHz + 9,590,423,758 instructions # 2.64 insn per cycle + 1.264094441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328961598104797 -Relative difference = 3.775440734888737e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136314298841171 +Relative difference = 1.0202225045153655e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.286459e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570924e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.551800 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.214375e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.385646e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.385646e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.568044 sec INFO: No Floating Point Exceptions have been reported - 1,548,820,259 cycles # 2.783 GHz - 3,686,997,614 instructions # 2.38 insn per cycle - 0.557181964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1472) (avx2: 0) (512y: 0) (512z: 0) + 1,641,692,574 cycles # 2.868 GHz + 3,971,413,789 instructions # 2.42 insn per cycle + 0.573200191 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328960439772345 -Relative difference = 1.0389396439618597e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136313297669403 +Relative difference = 7.826194092961498e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.038982e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.429005e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.429005e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.435959 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.937735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.175366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.175366e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.449120 sec INFO: No Floating Point Exceptions have been reported - 1,208,731,547 cycles # 2.744 GHz - 2,424,737,625 instructions # 2.01 insn per cycle - 0.441028354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1835) (512y: 0) (512z: 0) + 1,260,069,598 cycles # 2.777 GHz + 2,500,944,665 instructions # 1.98 insn per cycle + 0.454433467 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1934) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.104774e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.649197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.649197e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.429216 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.028517e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.518592e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.518592e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.437042 sec INFO: No Floating Point Exceptions have been reported - 1,186,032,707 cycles # 2.731 GHz - 2,375,887,228 instructions # 2.00 insn per cycle - 0.434797682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 2) (512z: 0) + 1,229,391,410 cycles # 2.784 GHz + 2,474,796,668 instructions # 2.01 insn per cycle + 0.442158464 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1885) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.873089e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.905822e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.905822e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.457639 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.850294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.803761e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.803761e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.462583 sec INFO: No Floating Point Exceptions have been reported - 1,058,281,507 cycles # 2.289 
GHz - 2,045,070,071 instructions # 1.93 insn per cycle - 0.462906421 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1125) (512y: 5) (512z: 1216) + 1,074,893,018 cycles # 2.301 GHz + 2,077,590,091 instructions # 1.93 insn per cycle + 0.467776190 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1161) (512y: 5) (512z: 1289) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328957567224279 -Relative difference = 5.7473080363015266e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136310806381516 +Relative difference = 1.9137449793670585e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index ac0cd4f08e..2a4a93a29c 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:04:07 +DATE: 2024-06-03_18:42:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.274861e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.207280e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.264572e+09 ) sec^-1 -MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 -TOTAL : 0.483176 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.359186e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223148e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.693051e+09 ) sec^-1 +MeanMatrixElemValue = ( 4.221160e-01 +- 1.229724e-04 ) GeV^0 +TOTAL : 0.483764 sec INFO: No Floating Point Exceptions have been reported - 1,992,294,984 cycles # 2.814 GHz - 2,821,532,343 instructions # 1.42 insn per cycle - 0.766539932 seconds time elapsed + 2,020,475,653 cycles # 2.852 GHz + 2,883,715,422 instructions # 1.43 insn per cycle + 0.766821024 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 71 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 93 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232893e-01 -Avg ME (F77/GPU) = 0.42328960436861962 -Relative difference = 7.190557844040413e-07 +Avg ME (C++/GPU) = 4.213628e-01 +Avg ME (F77/GPU) = 0.42136314490926452 +Relative difference = 8.185565136220069e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.761895e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138382e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138382e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.164683 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.967681e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.030786e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030786e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 1.260175 sec INFO: No Floating Point Exceptions have been reported - 3,347,740,263 cycles # 2.864 GHz - 8,536,643,122 instructions # 2.55 insn per cycle - 1.169926925 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 372) (avx2: 0) (512y: 0) (512z: 0) + 3,623,677,233 cycles # 2.866 GHz + 9,468,922,940 instructions # 2.61 insn per cycle + 1.265433979 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 379) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328961598104797 -Relative difference = 3.775440734888737e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136314298841171 +Relative difference = 1.0202225045153655e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.377967e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.780801e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.780801e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.533084 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.136354e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.208370e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.208370e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.583400 sec INFO: No Floating Point Exceptions have been reported - 1,538,945,489 cycles # 2.861 GHz - 3,654,064,050 instructions # 2.37 insn per cycle - 0.538506706 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1417) (avx2: 0) (512y: 0) (512z: 0) + 1,648,785,142 cycles # 2.804 GHz + 3,937,247,235 instructions # 2.39 insn per cycle + 0.588593369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1538) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328960439772345 -Relative difference = 1.0389396439618597e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136313297669403 +Relative difference = 7.826194092961498e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.061693e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.506473e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.506473e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.433720 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.943449e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.205095e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.205095e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.447831 sec INFO: No Floating Point Exceptions have been reported - 1,212,102,221 cycles # 2.764 GHz - 2,408,536,618 instructions # 1.99 insn per cycle - 0.439199966 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1739) (512y: 0) (512z: 0) + 1,255,206,933 cycles # 2.775 GHz + 2,485,414,473 instructions # 1.98 insn per cycle + 0.453037322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1825) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.151109e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.798793e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.798793e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.423301 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.018346e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.485553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.485553e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.437859 sec INFO: No Floating Point Exceptions have been reported - 1,183,993,010 cycles # 2.767 GHz - 2,358,621,607 instructions # 1.99 insn per cycle - 0.428753176 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1639) (512y: 2) (512z: 0) + 1,225,395,495 cycles # 2.770 GHz + 2,461,685,965 instructions # 2.01 insn per cycle + 0.442994976 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1794) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.903396e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.993626e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.993626e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.452229 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.842475e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.802452e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.460436 sec INFO: No Floating Point Exceptions have been reported - 1,057,097,121 cycles # 2.313 
GHz - 2,029,647,637 instructions # 1.92 insn per cycle - 0.457646372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1038) (512y: 5) (512z: 1206) + 1,070,780,988 cycles # 2.303 GHz + 2,061,995,899 instructions # 1.93 insn per cycle + 0.465732223 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1056) (512y: 5) (512z: 1271) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328957567224279 -Relative difference = 5.7473080363015266e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136310806381516 +Relative difference = 1.9137449793670585e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 175afd95a7..19dfb3d4f1 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:04:18 +DATE: 2024-06-03_18:43:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.031843e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.757380e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.369019e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.522857 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.378551e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.622662e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.340178e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.521109 sec INFO: No Floating Point Exceptions have been reported - 2,138,931,604 cycles # 2.823 GHz - 3,065,086,151 instructions # 1.43 insn per cycle - 0.815055642 seconds time elapsed + 2,134,508,813 cycles # 2.827 GHz + 3,062,937,081 instructions # 1.43 insn per cycle + 0.812795274 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961420809225 -Relative difference = 2.02678940084305e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314235618794 +Relative difference = 1.368031476336171e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.465539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090638e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090638e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.223589 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.749768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.980431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.980431e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.315046 sec INFO: No Floating Point Exceptions have been reported - 3,510,333,340 cycles # 2.858 GHz - 8,780,325,111 instructions # 2.50 insn per cycle - 1.229177242 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) + 3,786,972,515 cycles # 2.869 GHz + 9,739,934,413 instructions # 2.57 insn per cycle + 1.320813753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 427) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.643831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.197785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.197785e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.754390 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.549537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.026486e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.026486e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.793412 sec INFO: No Floating Point Exceptions have been reported - 2,172,612,390 cycles # 2.861 GHz - 5,461,989,505 instructions # 2.51 insn per cycle - 
0.760002313 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1315) (avx2: 0) (512y: 0) (512z: 0) + 2,284,858,516 cycles # 2.863 GHz + 5,921,938,395 instructions # 2.59 insn per cycle + 0.799063914 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.213379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.275644e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.275644e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.591323 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.221387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.291485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.291485e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.589193 sec INFO: No Floating Point Exceptions have been reported - 1,584,687,799 cycles # 2.657 GHz - 3,128,023,138 instructions # 1.97 insn per cycle - 0.597064372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1508) (512y: 0) (512z: 0) + 1,633,322,367 cycles # 2.750 GHz + 3,259,196,510 instructions # 2.00 insn per cycle + 0.594688637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1573) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 
2.378838e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.665981e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.665981e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.557636 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.273104e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.400962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400962e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.578780 sec INFO: No Floating Point Exceptions have been reported - 1,515,882,226 cycles # 2.694 GHz - 2,979,109,420 instructions # 1.97 insn per cycle - 0.563386258 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1266) (512y: 104) (512z: 0) + 1,601,507,893 cycles # 2.744 GHz + 3,214,454,807 instructions # 2.01 insn per cycle + 0.584254706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1458) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.142290e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.104661e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.104661e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.607146 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.112157e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.043444e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.043444e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.615303 sec INFO: No Floating Point Exceptions have been reported - 1,330,909,045 cycles # 2.175 GHz - 2,316,396,076 instructions # 1.74 insn per cycle - 0.612769323 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 708) (512y: 64) (512z: 1000) + 1,353,304,837 cycles # 2.183 GHz + 2,382,215,008 instructions # 1.76 insn per cycle + 0.620880493 seconds time elapsed +=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 768) (512y: 64) (512z: 1062) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index c48f15473d..4729f1a754 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:04:30 +DATE: 2024-06-03_18:43:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.249284e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.245731e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.567694e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.522953 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.500719e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.111270e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.633026e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.522680 sec INFO: No Floating Point Exceptions have been reported - 2,139,960,069 cycles # 2.824 GHz - 3,035,488,341 instructions # 1.42 insn per cycle - 0.816127779 seconds time elapsed + 2,129,724,895 cycles # 2.817 GHz + 3,027,277,951 instructions # 1.42 insn per cycle + 0.814390574 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961420809225 -Relative difference = 2.02678940084305e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314235618794 +Relative difference = 1.368031476336171e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.565233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104572e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104572e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.211740 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.796862e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.004350e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004350e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.308348 sec INFO: No Floating Point Exceptions have been reported - 3,489,404,912 cycles # 2.868 GHz - 8,691,090,951 instructions # 2.49 insn per cycle - 1.217329641 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) + 3,764,910,824 cycles # 2.867 GHz + 9,641,101,522 instructions # 2.56 insn per cycle + 1.314135370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.593140e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.091329e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.091329e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.773508 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.486685e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.920049e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920049e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.822453 sec INFO: No Floating Point Exceptions have been reported - 2,171,763,818 cycles # 2.790 GHz - 5,395,529,961 instructions # 2.48 insn per cycle - 
0.779263038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1286) (avx2: 0) (512y: 0) (512z: 0) + 2,311,474,083 cycles # 2.794 GHz + 5,864,868,802 instructions # 2.54 insn per cycle + 0.828103244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1379) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.359046e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.585537e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.585537e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.560683 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.198203e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.246567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246567e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.593532 sec INFO: No Floating Point Exceptions have been reported - 1,579,967,618 cycles # 2.793 GHz - 3,095,230,267 instructions # 1.96 insn per cycle - 0.566362230 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1403) (512y: 0) (512z: 0) + 1,642,446,088 cycles # 2.744 GHz + 3,222,193,167 instructions # 1.96 insn per cycle + 0.599144349 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 
2.453478e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.809547e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.809547e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.542291 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.278954e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.413566e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.413566e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.575915 sec INFO: No Floating Point Exceptions have been reported - 1,503,606,383 cycles # 2.747 GHz - 2,961,368,670 instructions # 1.97 insn per cycle - 0.547945591 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1207) (512y: 104) (512z: 0) + 1,590,360,868 cycles # 2.738 GHz + 3,186,450,755 instructions # 2.00 insn per cycle + 0.581550909 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1394) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.154104e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.130242e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.130242e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.603301 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.120832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.060002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060002e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.609960 sec INFO: No Floating Point Exceptions have been reported - 1,337,530,185 cycles # 2.200 GHz - 2,301,032,773 instructions # 1.72 insn per cycle - 0.608821276 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 669) (512y: 64) (512z: 987) + 1,354,216,533 cycles # 2.203 GHz + 2,366,532,725 instructions # 1.75 insn per cycle + 0.615403806 seconds time elapsed +=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 716) (512y: 64) (512z: 1053) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 279b0d02f4..3e4e2bc254 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:01:15 +DATE: 2024-06-03_18:39:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.298622e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163951e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277049e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.536129 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.560709e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.164481e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.284902e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.529405 sec INFO: No Floating Point Exceptions have been reported - 2,173,999,771 cycles # 2.814 GHz - 3,130,541,130 instructions # 1.44 insn per cycle - 0.830459706 seconds time elapsed + 2,182,288,380 cycles # 2.822 GHz + 3,131,212,236 instructions # 1.43 insn per cycle + 0.829981893 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795799595186 -Relative difference = 1.2987943449389332e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358666195562 +Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.020488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079496e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.079496e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.301250 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.781240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827388e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827388e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.998536 sec INFO: No Floating Point Exceptions have been reported - 15,199,764,023 cycles # 2.865 GHz - 38,382,132,016 instructions # 2.53 insn per cycle - 5.306881853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 673) (avx2: 0) (512y: 0) (512z: 0) + 17,213,253,642 cycles # 2.868 GHz + 45,933,739,384 instructions # 2.67 insn per cycle + 6.003818659 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 636) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593964 -Relative difference = 1.2987947225564713e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194407 +Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.456929e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.646631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.646631e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.144938 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.100817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.254980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254980e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.494378 sec INFO: No Floating Point Exceptions have been reported - 9,021,281,744 cycles # 2.864 GHz - 24,583,412,308 instructions # 2.73 insn per cycle - 3.150495651 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,016,228,015 cycles # 2.866 GHz + 27,811,005,547 instructions # 2.78 insn per cycle + 3.499794029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2549) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593955 -Relative difference = 1.2987947253027805e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.307946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.757750e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.757750e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.087521 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.858446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.237280e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 5.237280e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.271634 sec INFO: No Floating Point Exceptions have been reported - 5,484,294,083 cycles # 2.622 GHz - 11,256,076,031 instructions # 2.05 insn per cycle - 2.093164240 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2379) (512y: 0) (512z: 0) + 6,108,605,654 cycles # 2.684 GHz + 12,591,544,338 instructions # 2.06 insn per cycle + 2.277349103 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2696) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.046359e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
6.624997e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.624997e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.845597 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.187418e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.614304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.614304e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.132263 sec INFO: No Floating Point Exceptions have been reported - 4,960,542,470 cycles # 2.681 GHz - 10,562,896,493 instructions # 2.13 insn per cycle - 1.851112565 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) + 5,595,388,735 cycles # 2.618 GHz + 12,008,541,965 instructions # 2.15 insn per cycle + 2.137697029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595965e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.792720e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.792720e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.027661 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.355157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.526988e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526988e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.239016 sec INFO: No Floating Point Exceptions have been reported - 5,393,320,109 cycles # 1.779 GHz - 7,799,680,893 instructions # 1.45 insn per cycle - 3.033220380 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1545) + 5,757,421,005 cycles # 1.775 GHz + 8,347,756,435 instructions # 1.45 insn per cycle + 3.244654976 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1486) (512y: 122) (512z: 1805) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index c0d78783de..8f4087e613 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:01:39 +DATE: 2024-06-03_18:40:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.411711e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168792e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279006e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531618 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.573649e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160966e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281775e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.527940 sec INFO: No Floating Point Exceptions have been reported - 2,183,799,831 cycles # 2.822 GHz - 3,094,825,077 instructions # 1.42 insn per cycle - 0.831170247 seconds time elapsed + 2,161,411,588 cycles # 2.822 GHz + 3,105,080,516 instructions # 1.44 insn per cycle + 0.822592967 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795799595186 -Relative difference = 1.2987943449389332e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358666195562 +Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.106247e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.106247e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.236847 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.833695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882201e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.828520 sec INFO: No Floating Point Exceptions have been reported - 15,020,999,910 cycles # 2.866 GHz - 40,099,937,559 instructions # 2.67 insn per cycle - 5.242391746 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 16,713,198,070 cycles # 2.865 GHz + 44,917,694,732 instructions # 2.69 insn per cycle + 5.833774312 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593964 -Relative difference = 1.2987947225564713e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.600352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.806473e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.806473e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.023117 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.255035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.424858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424858e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.332985 sec INFO: No Floating Point Exceptions have been reported - 8,678,762,741 cycles # 2.866 GHz - 23,668,927,694 instructions # 2.73 insn per cycle - 3.028764335 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) + 9,568,995,899 cycles # 2.867 GHz + 26,692,502,057 instructions # 2.79 insn per cycle + 3.338418143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2343) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593955 -Relative difference = 1.2987947253027805e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.855816e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.228551e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.228551e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.270383 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.426523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.735021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 4.735021e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.480486 sec INFO: No Floating Point Exceptions have been reported - 6,094,093,478 cycles # 2.679 GHz - 13,059,046,457 instructions # 2.14 insn per cycle - 2.275971704 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2545) (512y: 0) (512z: 0) + 6,611,956,562 cycles # 2.661 GHz + 14,116,423,051 instructions # 2.13 insn per cycle + 2.486181855 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2780) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.110330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.520213e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.520213e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.162713 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.682757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.023291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.023291e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.348744 sec INFO: No Floating Point Exceptions have been reported - 5,811,211,556 cycles # 2.681 GHz - 12,318,701,301 instructions # 2.12 insn per cycle - 2.168344172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2092) (512y: 294) (512z: 0) + 6,326,699,067 cycles # 2.688 GHz + 13,709,965,055 instructions # 2.17 insn per cycle + 2.354222232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2436) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.308089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.473010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.473010e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.279972 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.269508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.429886e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.429886e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.318234 sec INFO: No Floating Point Exceptions have been reported - 5,822,767,374 cycles # 1.773 GHz - 9,603,130,120 instructions # 1.65 insn per cycle - 3.285517019 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1970) + 5,942,109,401 cycles # 1.788 GHz + 10,106,220,045 instructions # 1.70 insn per cycle + 3.323654210 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1336) (512y: 208) (512z: 1985) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 00b2a7887f..6d2dfde2c7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:02:03 +DATE: 2024-06-03_18:40:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.781883e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.602042e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.975899e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294909e+00 +- 3.228140e-03 ) GeV^0 -TOTAL : 0.491268 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.307381e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.106656e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353255e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 +TOTAL : 0.487472 sec INFO: No Floating Point Exceptions have been reported - 2,009,507,618 cycles # 2.799 GHz - 2,900,729,651 instructions # 1.44 insn per cycle - 0.775023034 seconds time elapsed + 2,006,518,607 cycles # 2.817 GHz + 2,898,423,990 instructions # 1.44 insn per cycle + 0.769535603 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234085e+00 -Avg ME (F77/GPU) = 3.2341253389604390 -Relative difference = 1.2473067479392238e-05 +Avg ME (C++/GPU) = 2.015841e+00 +Avg ME (F77/GPU) = 2.0158787037944421 +Relative difference = 1.870375413642407e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.165672e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.236261e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.236261e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 -TOTAL : 4.931237 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.881488e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934328e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 +TOTAL : 5.662703 sec INFO: No Floating Point Exceptions have been reported - 14,146,731,336 cycles # 2.866 GHz - 38,345,680,249 instructions # 2.71 insn per cycle - 4.936728571 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) + 16,249,524,467 cycles # 2.867 GHz + 45,328,757,089 instructions # 2.79 insn per cycle + 5.667894899 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340941932052374 -Relative difference = 5.974014286114415e-08 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158491701586172 +Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.834710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.233739e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.233739e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 -TOTAL : 2.259579 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.401859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731419e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 +TOTAL : 2.473431 sec INFO: No Floating Point Exceptions have been reported - 6,488,445,171 cycles # 2.865 GHz - 15,819,901,990 instructions # 2.44 insn per cycle - 2.265166416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2693) (avx2: 0) (512y: 0) (512z: 0) + 7,083,637,480 cycles # 2.859 GHz + 17,776,736,480 instructions # 2.51 insn per cycle + 2.478699418 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234093e+00 -Avg ME (F77/C++) = 3.2340934062376618 -Relative difference = 1.2561100182708985e-07 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158486895961687 +Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.775248e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005815e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.005815e+06 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.284026 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.050029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.137169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.137169e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.392863 sec INFO: No Floating Point Exceptions have been reported - 3,459,365,538 cycles # 2.685 GHz - 7,598,231,538 instructions # 2.20 insn per cycle - 1.289366574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3054) (512y: 0) (512z: 0) + 3,745,294,052 cycles # 2.680 GHz + 8,268,263,382 instructions # 2.21 insn per cycle + 1.398145455 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3379) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919882990420 -Relative difference = 3.6180040581126224e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.437805e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092647e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092647e+06 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.200609 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.368203e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.560931e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.560931e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.342420 sec INFO: No Floating Point Exceptions have been reported - 3,247,417,265 cycles # 2.696 GHz - 7,207,177,396 instructions # 2.22 insn per cycle - 1.205866400 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 23) (512z: 0) + 3,561,581,907 cycles # 2.644 GHz + 7,923,251,903 instructions # 2.22 insn per cycle + 1.347707221 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3231) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919882990420 -Relative difference = 3.6180040581126224e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.751112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.477606e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.477606e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.642510 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.251505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866721e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.866721e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.767422 sec INFO: No Floating Point Exceptions have been reported - 3,066,183,622 cycles # 1.862 GHz - 
5,839,500,735 instructions # 1.90 insn per cycle - 1.647870341 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2375) (512y: 24) (512z: 1889) + 3,256,427,300 cycles # 1.838 GHz + 6,105,183,383 instructions # 1.87 insn per cycle + 1.772647081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2407) (512y: 24) (512z: 2153) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340921289287508 -Relative difference = 3.986551736519174e-08 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158476348733529 +Relative difference = 1.8112806478434436e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 2e0a99a1cf..ccfedd0706 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:02:23 +DATE: 2024-06-03_18:41:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.478321e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.704119e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.049758e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294909e+00 +- 3.228140e-03 ) GeV^0 -TOTAL : 0.485405 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.045625e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.457120e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.724518e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 +TOTAL : 0.481413 sec INFO: No Floating Point Exceptions have been reported - 2,012,796,173 cycles # 2.826 GHz - 2,903,006,317 instructions # 1.44 insn per cycle - 0.768724930 seconds time elapsed + 1,998,175,303 cycles # 2.818 GHz + 2,885,226,643 instructions # 1.44 insn per cycle + 0.764831178 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234085e+00 -Avg ME (F77/GPU) = 3.2341253389604390 -Relative difference = 1.2473067479392238e-05 +Avg ME (C++/GPU) = 2.015841e+00 +Avg ME (F77/GPU) = 2.0158787037944421 +Relative difference = 1.870375413642407e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.137053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.205479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.205479e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 -TOTAL : 4.995257 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.911474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.965887e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965887e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 +TOTAL : 5.573857 sec INFO: No Floating Point Exceptions have been reported - 14,321,719,125 cycles # 2.865 GHz - 39,835,690,494 instructions # 2.78 insn per cycle - 5.000617134 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) + 15,991,216,430 cycles # 2.867 GHz + 44,432,971,909 instructions # 2.78 insn per cycle + 5.578945319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340941675938666 -Relative difference = 5.182096339328524e-08 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158491701586172 +Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.647900e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.198922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.198922e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 -TOTAL : 1.945695 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.002910e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.429530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.429530e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 +TOTAL : 2.186306 sec INFO: No Floating Point Exceptions have been reported - 5,584,746,201 cycles # 2.864 GHz - 15,284,426,800 instructions # 2.74 insn per cycle - 1.951180487 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) + 6,066,277,847 cycles # 2.770 GHz + 17,078,408,150 instructions # 2.82 insn per cycle + 2.191394262 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2881) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234093e+00 -Avg ME (F77/C++) = 3.2340934062376618 -Relative difference = 1.2561100182708985e-07 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158486895961687 +Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.237642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.855975e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.855975e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.769454 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.883569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.447155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.447155e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.870818 sec INFO: No Floating Point Exceptions have been reported - 4,749,170,648 cycles # 2.677 GHz - 9,734,022,428 instructions # 2.05 insn per cycle - 1.774782238 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3707) (512y: 0) (512z: 0) + 5,034,265,873 cycles # 2.685 GHz + 10,228,613,656 instructions # 2.03 insn per cycle + 1.876045036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3916) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919817797840 -Relative difference = 5.633796441974414e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.407642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.062522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.062522e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.724353 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.939491e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.517106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.517106e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.853794 sec INFO: No Floating Point Exceptions have been reported - 4,623,692,148 cycles # 2.674 GHz - 9,324,388,930 instructions # 2.02 insn per cycle - 1.729730077 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3495) (512y: 0) (512z: 0) + 4,981,257,343 cycles # 2.681 GHz + 9,997,702,736 instructions # 2.01 insn per cycle + 1.858959303 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3823) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919817797840 -Relative difference = 5.633796441974414e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.457416e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.921892e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.921892e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 2.009945 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.481243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.789058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.789058e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 2.429244 sec INFO: No Floating Point Exceptions have been reported - 3,661,033,798 cycles # 1.818 GHz - 7,034,840,971 
instructions # 1.92 insn per cycle - 2.015460084 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 12) (512z: 2220) + 4,364,624,594 cycles # 1.794 GHz + 8,448,218,621 instructions # 1.94 insn per cycle + 2.434426241 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 4) (512z: 2751) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340921270661056 -Relative difference = 3.928957668408837e-08 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158476348733529 +Relative difference = 1.8112806478434436e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index ea5a9dfe42..cf858f4377 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:02:44 +DATE: 2024-06-03_18:41:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.411466e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167511e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278097e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531668 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.593015e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162361e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.282560e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.529138 sec INFO: No Floating Point Exceptions have been reported - 2,181,212,654 cycles # 2.815 GHz - 3,113,262,519 instructions # 1.43 insn per cycle - 0.831954647 seconds time elapsed + 2,189,111,228 cycles # 2.820 GHz + 3,106,086,555 instructions # 1.42 insn per cycle + 0.833333259 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795839181666 -Relative difference = 1.2865539301192385e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358639104246 +Relative difference = 6.751024171044779e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.011160e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069301e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.324881 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.754698e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799357e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 6.087390 sec INFO: No Floating Point Exceptions have been reported - 15,270,464,570 cycles # 2.866 GHz - 38,583,585,562 instructions # 2.53 insn per cycle - 5.330529959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 677) (avx2: 0) (512y: 0) (512z: 0) + 17,437,223,674 cycles # 2.863 GHz + 46,088,336,844 instructions # 2.64 insn per cycle + 6.092892306 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 636) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.489143e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.682455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.682455e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.116860 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.122318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279676e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.470120 sec INFO: No Floating Point Exceptions have been reported - 8,946,095,501 cycles # 2.866 GHz - 24,230,074,231 instructions # 2.71 insn per cycle - 3.122402700 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) + 9,965,552,209 cycles # 2.868 GHz + 27,601,401,595 instructions # 2.77 insn per cycle + 3.475667653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.511600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.996636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.996636e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.016820 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.892219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272346e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 5.272346e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.255972 sec INFO: No Floating Point Exceptions have been reported - 5,398,022,181 cycles # 2.671 GHz - 11,281,135,154 instructions # 2.09 insn per cycle - 2.022588826 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2483) (512y: 0) (512z: 0) + 6,032,808,300 cycles # 2.668 GHz + 12,495,305,658 instructions # 2.07 insn per cycle + 2.261593430 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2783) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.136939e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.737334e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.737334e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.820797 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.389880e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.851032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.851032e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.057011 sec INFO: No Floating Point Exceptions have been reported - 4,868,000,366 cycles # 2.667 GHz - 10,530,833,141 instructions # 2.16 insn per cycle - 1.826342910 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2170) (512y: 148) (512z: 0) + 5,514,973,271 cycles # 2.675 GHz + 11,929,839,957 instructions # 2.16 insn per cycle + 2.062497201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2534) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.738166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.952895e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.952895e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.917099 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.457890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639313e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.145292 sec INFO: No Floating Point Exceptions have been reported - 5,206,834,374 cycles # 1.782 GHz - 7,607,869,413 instructions # 1.46 insn per cycle - 2.922673764 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1611) + 5,590,550,000 cycles # 1.775 GHz + 8,120,275,403 instructions # 1.45 insn per cycle + 3.151215102 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1668) (512y: 126) (512z: 1865) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 611ee95bf5..1ec6d6d579 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:03:07 +DATE: 2024-06-03_18:41:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.372214e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.166063e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277281e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531410 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.483040e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.148416e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265278e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.530656 sec INFO: No Floating Point Exceptions have been reported - 2,165,210,077 cycles # 2.821 GHz - 3,108,583,253 instructions # 1.44 insn per cycle - 0.824896732 seconds time elapsed + 2,180,510,722 cycles # 2.824 GHz + 3,126,960,490 instructions # 1.43 insn per cycle + 0.829636549 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795839181666 -Relative difference = 1.2865539301192385e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358639104246 +Relative difference = 6.751024171044779e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.998905e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.057195e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057195e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.355715 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.808177e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855648e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.910250 sec INFO: No Floating Point Exceptions have been reported - 15,350,187,703 cycles # 2.864 GHz - 40,368,332,178 instructions # 2.63 insn per cycle - 5.361266689 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 16,954,887,617 cycles # 2.866 GHz + 45,103,327,044 instructions # 2.66 insn per cycle + 5.915938477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 581) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.655753e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.869389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.869389e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.979107 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.172628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333065e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.416788 sec INFO: No Floating Point Exceptions have been reported - 8,538,137,981 cycles # 2.862 GHz - 23,251,495,548 instructions # 2.72 insn per cycle - 2.984645572 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2090) (avx2: 0) (512y: 0) (512z: 0) + 9,502,201,673 cycles # 2.777 GHz + 26,246,195,465 instructions # 2.76 insn per cycle + 3.422433352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2397) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.687752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.034396e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.034396e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.348055 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.374754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 4.674975e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.508984 sec INFO: No Floating Point Exceptions have been reported - 6,251,107,015 cycles # 2.657 GHz - 12,960,902,963 instructions # 2.07 insn per cycle - 2.353740392 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2668) (512y: 0) (512z: 0) + 6,734,505,509 cycles # 2.680 GHz + 14,036,419,832 instructions # 2.08 insn per cycle + 2.514518815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.964987e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.353858e+05 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.353858e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.222113 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.620511e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.957323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.957323e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.379950 sec INFO: No Floating Point Exceptions have been reported - 5,918,688,699 cycles # 2.658 GHz - 12,237,201,089 instructions # 2.07 insn per cycle - 2.227737714 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2208) (512y: 296) (512z: 0) + 6,387,248,616 cycles # 2.678 GHz + 13,522,465,773 instructions # 2.12 insn per cycle + 2.385413757 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2543) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.434413e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.612405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.612405e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.164008 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.449512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630105e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.151083 sec INFO: No Floating Point Exceptions have been reported - 5,604,141,468 cycles # 1.769 GHz - 8,744,053,502 instructions # 1.56 insn per cycle - 3.169616891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1908) + 5,596,390,973 cycles # 1.774 GHz + 9,216,406,251 instructions # 1.65 insn per cycle + 3.156653341 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1453) (512y: 212) (512z: 2058) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= From 08f681b63474433b2d8a8edc0863a8107b470189 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 4 Jun 2024 00:01:18 +0200 Subject: [PATCH 24/33] [tmad] rerun again 30 tmad tests on itscrd90 - all as expected (failures in heft #833, susy #826 and ggttgg #856 - but susy #825 is fixed) STARTED AT Mon Jun 3 07:13:33 PM CEST 2024 (SM tests) ENDED(1) AT Mon Jun 3 11:45:17 PM CEST 2024 [Status=0] (BSM tests) ENDED(1) AT Mon Jun 3 11:55:27 PM CEST 2024 [Status=0] 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt 1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt 24 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt 0 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 126 ++--- .../log_eemumu_mad_f_inl0_hrd0.txt | 130 ++--- .../log_eemumu_mad_m_inl0_hrd0.txt | 134 ++--- .../log_ggtt_mad_d_inl0_hrd0.txt | 162 +++--- .../log_ggtt_mad_f_inl0_hrd0.txt | 180 +++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 132 ++--- .../log_ggttg_mad_d_inl0_hrd0.txt | 134 ++--- .../log_ggttg_mad_f_inl0_hrd0.txt | 178 +++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 124 ++--- .../log_ggttgg_mad_d_inl0_hrd0.txt | 34 +- .../log_ggttgg_mad_f_inl0_hrd0.txt | 36 +- .../log_ggttgg_mad_m_inl0_hrd0.txt | 32 +- .../log_ggttggg_mad_d_inl0_hrd0.txt | 134 ++--- .../log_ggttggg_mad_f_inl0_hrd0.txt | 182 +++---- .../log_ggttggg_mad_m_inl0_hrd0.txt | 132 ++--- .../log_gqttq_mad_d_inl0_hrd0.txt | 138 ++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 180 +++---- .../log_gqttq_mad_m_inl0_hrd0.txt | 174 +++--- .../log_heftggbb_mad_d_inl0_hrd0.txt | 150 +++--- .../log_heftggbb_mad_f_inl0_hrd0.txt | 32 +- .../log_heftggbb_mad_m_inl0_hrd0.txt | 148 +++--- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 134 ++--- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 150 +++--- 
.../log_smeftggtttt_mad_m_inl0_hrd0.txt | 132 ++--- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 22 +- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 26 +- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 26 +- .../log_susyggtt_mad_d_inl0_hrd0.txt | 497 +++++++++++++++++- .../log_susyggtt_mad_f_inl0_hrd0.txt | 497 +++++++++++++++++- .../log_susyggtt_mad_m_inl0_hrd0.txt | 497 +++++++++++++++++- 30 files changed, 3018 insertions(+), 1635 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2606511c5e..9e59cc6053 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:10:35 +DATE: 2024-06-03_19:13:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7366s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7280s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7249s + [COUNTERS] Fortran MEs ( 1 ) : 
0.0087s for 8192 events => throughput is 9.45E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1807s + [COUNTERS] PROGRAM TOTAL : 0.1897s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.47E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3993s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3064s - [COUNTERS] Fortran MEs ( 1 ) : 0.0929s for 90112 events => throughput is 9.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3979s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3046s + [COUNTERS] Fortran MEs ( 1 ) : 0.0933s for 90112 events => throughput is 9.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1957s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1888s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1895s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** 
(2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3886s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0768s for 90112 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3924s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0805s for 90112 events => throughput is 1.12E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221697e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144905e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224133e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.166697e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1859s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1897s + [COUNTERS] Fortran Overhead ( 0 ) 
: 0.1853s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 8192 events => throughput is 1.86E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3675s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3196s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0478s for 90112 events => throughput is 1.88E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3591s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0490s for 90112 events => throughput is 1.84E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.934007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894279e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891822e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934324e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1838s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events 
=> throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1877s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.47E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3454s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 90112 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 90112 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.401919e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.465927e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.574310e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577145e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 
0.1873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1843s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.47E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 90112 events => throughput is 2.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3463s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3102s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0362s for 90112 events => throughput is 2.49E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.678445e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.462815e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.730941e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.650299e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1849s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1897s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1856s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.99E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,8 +470,8 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3584s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3136s + [COUNTERS] PROGRAM TOTAL : 0.3567s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0448s for 90112 events => throughput is 2.01E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 
+485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.084448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.064495e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.168911e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.6089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.59E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6144s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6139s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.62E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7330s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0051s for 90112 events => throughput is 1.78E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7431s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7382s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => 
throughput is 1.82E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.956984e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.136979e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914466e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920255e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.022770e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.143530e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.442801e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.460339e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.975645e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109323e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.864611e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.972398e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.017368e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119337e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.144056e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.144467e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index e035800d31..9bfddda4fe 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:10:51 +DATE: 2024-06-03_19:14:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7341s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7255s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7239s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 
events) - [COUNTERS] PROGRAM TOTAL : 0.1884s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1797s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,8 +108,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3994s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3066s + [COUNTERS] PROGRAM TOTAL : 0.3982s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3054s [COUNTERS] Fortran MEs ( 1 ) : 0.0928s for 90112 events => throughput is 9.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1849s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1924s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3880s - [COUNTERS] Fortran Overhead 
( 0 ) : 0.3143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0737s for 90112 events => throughput is 1.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 90112 events => throughput is 1.18E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.262067e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220259e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.264285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.173712e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1837s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1810s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1840s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.01E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 
[UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3356s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3059s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0296s for 90112 events => throughput is 3.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3374s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3077s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0297s for 90112 events => throughput is 3.03E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.173804e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.033669e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.178187e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186398e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1861s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1836s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1867s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1842s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.28E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,8 +318,8 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < 
/tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3359s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s + [COUNTERS] PROGRAM TOTAL : 0.3362s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3085s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 90112 events => throughput is 3.26E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.322945e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.258347e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.473480e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.603900e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1843s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1872s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.37E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' 
./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3366s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 90112 events => throughput is 3.29E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 90112 events => throughput is 3.30E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.219490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276103e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.481368e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299422e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1847s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.00E+06 
events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3400s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3111s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0290s for 90112 events => throughput is 3.11E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3417s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3126s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0291s for 90112 events => throughput is 3.10E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.352002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299333e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.429631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.595053e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.6098s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6147s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.6142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.70E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7372s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7325s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.93E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7444s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7397s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.92E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.470249e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.721178e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.721504e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.381501e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.332382e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.048976e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791017e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.378693e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.411393e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.245527e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.090571e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.733078e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.740893e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.449608e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.788549e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index f4bdf77873..0d5022e324 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -14,8 +14,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:11:07 +DATE: 2024-06-03_19:14:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7265s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7357s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7271s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1891s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1804s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1927s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] 
Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3984s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3056s - [COUNTERS] Fortran MEs ( 1 ) : 0.0928s for 90112 events => throughput is 9.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3946s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3021s + [COUNTERS] Fortran MEs ( 1 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.2031s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1958s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1962s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1887s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3908s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0787s for 90112 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3949s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3125s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 90112 events => throughput is 1.09E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.180947e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055902e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.192933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144724e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1888s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1846s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1897s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.91E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3558s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 90112 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3575s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3103s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0472s for 90112 events => throughput is 1.91E+06 
events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.003834e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.950984e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.101928e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.054562e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1830s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3474s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3105s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 90112 events => throughput is 2.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3468s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.3091s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0376s for 90112 events => throughput is 2.40E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.570225e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366259e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.646601e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.465632e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.56E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1885s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.51E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3437s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0349s for 
90112 events => throughput is 2.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3474s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0362s for 90112 events => throughput is 2.49E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.427741e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.664209e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.608097e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1851s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1843s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM 
TOTAL : 0.3556s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3129s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 90112 events => throughput is 2.11E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3560s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0430s for 90112 events => throughput is 2.09E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.068822e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.200700e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.271696e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273677e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.6088s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6083s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6163s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 
[9.1515602021089631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7369s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7319s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.81E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7352s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.80E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.813912e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.241160e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921066e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894323e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.979130e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.098589e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.509503e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.500179e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.019788e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141493e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.083255e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.064419e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.995167e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.123437e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.161218e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.162393e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index df1862c6e5..660fbe8faf 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -3,8 +3,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:11:23 +DATE: 2024-06-03_19:14:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7874s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7902s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4145s - [COUNTERS] Fortran Overhead 
( 0 ) : 0.3707s - [COUNTERS] Fortran MEs ( 1 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7613s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2812s - [COUNTERS] Fortran MEs ( 1 ) : 0.4801s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2880s + [COUNTERS] Fortran MEs ( 1 ) : 0.4818s for 90112 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 8192 events => throughput is 2.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4240s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0459s for 8192 events => throughput is 1.79E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -164,15 +164,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [XSECTION] Cross section = 
47.11 [47.105695279989099] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.8094s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3736s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4358s for 90112 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9007s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3945s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5062s for 90112 events => throughput is 1.78E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989099) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.124003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822831e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.123725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828898e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events 
(found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4185s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3956s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4309s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4052s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.18E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6287s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3727s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2560s for 90112 events => throughput is 3.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6584s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3704s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2880s for 90112 events => throughput is 3.13E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.603219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.236574e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.608626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.236820e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4087s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.08E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,15 +316,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989135] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5189s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3601s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1588s for 90112 events => throughput is 5.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5429s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3649s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1780s for 90112 events => throughput is 5.06E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989135) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -333,12 +333,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.698027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.004607e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.833893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.981435e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4150s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4000s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0150s for 8192 events => throughput is 5.47E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,15 +392,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989135] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5009s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3580s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1429s for 90112 events => throughput is 6.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5242s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1630s for 90112 events => throughput is 5.53E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989135) differ by less than 3E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.422941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.503100e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.567554e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.414067e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4029s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4024s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,15 +468,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989135] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6212s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3792s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2420s for 90112 events => throughput is 3.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6343s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3748s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2595s for 90112 events => throughput is 3.47E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989135) differ by less than 3E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -485,12 +485,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.780392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552498e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.785329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569972e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.8191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8185s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8206s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8200s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.44E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7827s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7760s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7909s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 90112 events => throughput is 1.33E+07 events/s *** (3-cuda) 
Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043198e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.090420e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.609307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.630550e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.187148e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265029e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081029e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074243e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.178903e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.247705e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.154384e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 1.154009e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.194807e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276487e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.086257e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.074034e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 40923b92a9..86c8571c7c 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -13,8 +13,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
@@ -22,9 +22,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:11:50 +DATE: 2024-06-03_19:15:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7812s - [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7819s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran 
> /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4161s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4174s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3732s + [COUNTERS] Fortran MEs ( 1 ) : 0.0442s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7730s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2919s - [COUNTERS] Fortran MEs ( 1 ) : 0.4810s for 90112 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2920s + [COUNTERS] Fortran MEs ( 1 ) : 0.4840s for 90112 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094179692708323] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4157s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0375s for 8192 events => throughput is 2.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4629s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4198s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s *** 
(2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094179780921394) differ by less than 4E-4 (1.0665510541407741e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094179692708323) differ by less than 4E-4 (1.0852822573959031e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,15 +164,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105688388783328] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.8031s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3895s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4136s for 90112 events => throughput is 2.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8654s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4739s for 90112 events => throughput is 1.90E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ by less than 4E-4 (1.4224799227413598e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105688388783328) differ by less than 4E-4 (1.462924120732012e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.242461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.943677e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.262275e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913714e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094175707109216] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3947s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.68E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094175850060040) differ by less than 4E-4 (1.9012318908107062e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094175707109216) differ by less than 4E-4 (1.9315861321533845e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,15 +240,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684583433771] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5387s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3642s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1745s for 90112 events => throughput is 5.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5567s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3640s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1927s for 90112 events => throughput is 4.68E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ by less than 4E-4 (2.2324275217311396e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684583433771) differ by less than 4E-4 (2.2707562807866566e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -257,12 +257,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.184177e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.658213e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.248092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.757866e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173726920275] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3955s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3863s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0092s for 8192 events => throughput is 8.93E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094173726920275) differ by less than 4E-4 (2.3520603253945893e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,15 +316,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684037363524] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4481s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0939s for 90112 events => throughput is 9.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4583s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1035s for 90112 events => throughput is 8.70E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684037363524) differ by less than 4E-4 (2.386680745258829e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -333,12 +333,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.679138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.749854e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.562229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.538146e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173726920275] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4002s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3972s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0089s for 8192 events => throughput is 9.22E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094173726920275) differ by less than 4E-4 (2.3520603253945893e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,15 +392,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684037363524] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4436s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3562s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4605s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3596s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1010s for 90112 events => throughput is 8.92E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684037363524) differ by less than 4E-4 (2.386680745258829e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034170e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.142616e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073727e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.281160e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094178448427996] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4028s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 7.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4026s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.57E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094178213275804) differ by less than 4E-4 (1.3994256109484127e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094178448427996) differ by less than 4E-4 (1.3494932904478674e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,15 +468,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105688391432061] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4886s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3609s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1277s for 90112 events => throughput is 7.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4961s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3593s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1368s for 90112 events => throughput is 6.59E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ by less than 4E-4 (1.4588574703822133e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105688391432061) differ by less than 4E-4 (1.462361824966507e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -485,12 +485,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.325091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.813125e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.420865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.875027e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184162782994] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.8132s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8126s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8152s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8147s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cuda (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) +OK! 
xsec from fortran (47.094184803756640) and cuda (47.094184162782994) differ by less than 4E-4 (1.3610462645807786e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,15 +544,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105694501043516] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 2.0342s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0284s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0058s for 90112 events => throughput is 1.55E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7910s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.54E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cuda (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) +OK! xsec from fortran (47.105695279989114) and cuda (47.105694501043516) differ by less than 4E-4 (1.6536123581545326e-08) *** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -561,42 +561,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201338e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.195980e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.880181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.556275e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.110951e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.786203e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785133e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.463457e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.085768e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.802991e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.869688e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.550775e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.657984e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.392300e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.441526e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.521872e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 94bbdb8240..f3afa24d44 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone + + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:12:17 +DATE: 2024-06-03_19:15:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8280s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7840s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7847s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4164s + [COUNTERS] PROGRAM TOTAL : 0.4163s [COUNTERS] Fortran Overhead ( 0 ) : 0.3725s - [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' 
./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7705s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2895s - [COUNTERS] Fortran MEs ( 1 ) : 0.4809s for 90112 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7755s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2925s + [COUNTERS] Fortran MEs ( 1 ) : 0.4830s for 90112 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4599s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4718s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4255s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.8301s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3854s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4447s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9103s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3987s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5116s for 90112 events => throughput is 1.76E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 
@@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.068435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804065e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.071645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.811315e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3999s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4289s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4036s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.24E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6239s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3717s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2522s for 90112 events => throughput is 3.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6962s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2789s for 90112 events => throughput is 3.23E+05 events/s *** 
(2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.641241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.250423e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.264215e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4096s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.14E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5203s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3637s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1566s for 90112 events => throughput is 5.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5412s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3619s + [COUNTERS] 
CudaCpp MEs ( 2 ) : 0.1793s for 90112 events => throughput is 5.03E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.707638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.016563e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.808160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.972402e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4034s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3908s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.60E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5038s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3639s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1399s for 90112 events => throughput is 6.44E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 1.5203s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3593s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1610s for 90112 events => throughput is 5.60E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.443112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.586422e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.548414e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.605389e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4017s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0232s for 8192 events => throughput is 3.53E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6084s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3746s - [COUNTERS] CudaCpp 
MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6374s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3830s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2544s for 90112 events => throughput is 3.54E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.920724e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.593888e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.976399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.629688e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.8145s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8139s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8166s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8160s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] 
PROGRAM TOTAL : 1.7903s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7836s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7894s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.35E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.015619e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100491e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.591542e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.637215e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.178918e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269188e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.065499e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069119e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.176442e+07 
) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265612e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149411e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.145939e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.170597e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.255728e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.084870e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993627e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 744f7cd9e1..146533aa10 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:12:44 +DATE: 2024-06-03_19:16:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.7183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3761s - [COUNTERS] Fortran MEs ( 1 ) : 0.3421s for 8192 events => throughput is 2.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3724s + [COUNTERS] Fortran MEs ( 1 ) : 0.3417s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6762s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3345s - [COUNTERS] Fortran MEs ( 1 ) : 0.3417s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6753s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3338s + [COUNTERS] Fortran MEs ( 1 ) : 0.3416s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM 
TOTAL : 5.3647s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6103s - [COUNTERS] Fortran MEs ( 1 ) : 3.7545s for 90112 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3486s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6022s + [COUNTERS] Fortran MEs ( 1 ) : 3.7464s for 90112 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0244s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3493s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3561s for 8192 events => throughput is 2.30E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.7825s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9343s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8482s for 90112 events => throughput is 2.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.8429s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9359s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.9070s for 90112 events => throughput is 2.31E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.407027e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.383395e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.413486e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.394045e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5112s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1814s for 8192 events => throughput is 4.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7001s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5144s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1857s for 8192 events => throughput is 4.41E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.7969s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7715s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.0254s for 90112 events => throughput is 4.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8246s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7745s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.0500s for 90112 events => throughput is 4.40E+04 events/s *** 
(2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.630694e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.523279e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.633592e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.523274e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4270s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0917s for 8192 events => throughput is 8.93E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4248s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0931s for 8192 events => throughput is 8.80E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6871s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6808s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0063s for 90112 events => throughput is 8.96E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6941s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6721s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 1.0220s for 90112 events => throughput is 8.82E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.164525e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.824242e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.217748e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.885307e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0816s for 8192 events => throughput is 1.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4967s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4133s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0834s for 8192 events => throughput is 9.83E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5733s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6744s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8989s for 90112 events => throughput is 1.00E+05 
events/s + [COUNTERS] PROGRAM TOTAL : 2.5780s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6577s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9203s for 90112 events => throughput is 9.79E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.010185e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.030845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.015851e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4520s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1183s for 8192 events => throughput is 6.92E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5684s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1192s for 8192 events => throughput is 6.87E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.0127s - [COUNTERS] Fortran Overhead ( 0 ) : 
1.7123s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3005s for 90112 events => throughput is 6.93E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.0011s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7007s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3004s for 90112 events => throughput is 6.93E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.991688e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.959669e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.017662e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.959730e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7773s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7719s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7829s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7774s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events 
(found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0409s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0463s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0233s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 90112 events => throughput is 3.92E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.631318e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640011e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.120129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.164795e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.935888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.949085e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244627e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245266e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 9.951586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.984443e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.254745e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.255624e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.930773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.966094e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.773295e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index ed5f9117a3..5b2db5e97c 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,9 +1,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:13:28 +DATE: 2024-06-03_19:16:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.7143s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3729s - [COUNTERS] Fortran MEs ( 1 ) : 0.3414s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7110s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s + [COUNTERS] Fortran MEs ( 1 ) : 0.3405s for 8192 events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6777s - [COUNTERS] 
Fortran Overhead ( 0 ) : 0.3354s - [COUNTERS] Fortran MEs ( 1 ) : 0.3423s for 8192 events => throughput is 2.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s + [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3614s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6067s - [COUNTERS] Fortran MEs ( 1 ) : 3.7546s for 90112 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3556s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6074s + [COUNTERS] Fortran MEs ( 1 ) : 3.7482s for 90112 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112722616246457] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9951s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6578s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3372s for 8192 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6671s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3435s for 8192 events => throughput is 2.38E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722621426752) differ by less than 4E-4 (2.569659680817793e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112722616246457) differ by less than 4E-4 (2.570171934723753e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,15 +164,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238468293717765E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.6208s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9132s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7075s for 90112 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.6804s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9216s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7588s for 90112 events => throughput is 2.40E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002) differ by less than 4E-4 (1.719182115555995e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468293717765E-002) differ by less than 4E-4 (1.721259623721494e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.551463e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.472425e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.494483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.477772e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112720694019242] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5327s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4314s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1013s for 8192 events => throughput is 8.09E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5374s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1042s for 8192 events => throughput is 7.86E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720710186394) differ by less than 4E-4 (2.758652844936371e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112720694019242) differ by less than 4E-4 (2.760251535116609e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,15 +240,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238454783817719E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.8030s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6873s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1157s for 90112 events => throughput is 8.08E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.8486s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6901s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1585s for 90112 events => throughput is 7.78E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002) differ by less than 4E-4 (3.4258681169685445e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454783817719E-002) differ by less than 4E-4 (3.4262266690454624e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -257,12 +257,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.275405e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.054064e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.220360e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.029700e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721757974454] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3800s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0469s for 8192 events => throughput is 1.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4376s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3898s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0477s for 8192 events => throughput is 1.72E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112721757974454) differ by less than 4E-4 (2.655042234289695e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,15 +316,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238453732924513E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.1488s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6302s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5185s for 90112 events => throughput is 1.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1586s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6309s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5276s for 90112 events => throughput is 1.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453732924513E-002) differ by less than 4E-4 (3.558850765195132e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -333,12 +333,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.780438e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736136e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.775796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.706286e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721757974454] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3735s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4187s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3752s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0436s for 8192 events => throughput is 1.88E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112721757974454) differ by less than 4E-4 (2.655042234289695e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,15 +392,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238453732924513E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.1028s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6328s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4700s for 90112 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4820s for 90112 events => throughput is 1.87E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453732924513E-002) differ by less than 4E-4 (3.558850765195132e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.959673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.901200e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.972854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890782e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112723389095883] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3902s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0574s for 8192 events => throughput is 1.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4478s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3898s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0580s for 8192 events => throughput is 1.41E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112723387847480) differ by less than 4E-4 (2.4938721023826105e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112723389095883) differ by less than 4E-4 (2.493748653908945e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,15 +468,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238464413054557E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.2861s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6311s for 90112 events => throughput is 1.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2869s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6459s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6410s for 90112 events => throughput is 1.41E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002) differ by less than 4E-4 (2.211270000440635e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464413054557E-002) differ by less than 4E-4 (2.2110043929046697e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -485,12 +485,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433747e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.456217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.439231e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112725654777677] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7686s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7679s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7669s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.64E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cuda (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) +OK! 
xsec from fortran (0.10112748607749111) and cuda (0.10112725654777677) differ by less than 4E-4 (2.269706518509551e-06) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,15 +544,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238470908598507E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0336s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0236s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 90112 events => throughput is 9.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0334s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0228s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0106s for 90112 events => throughput is 8.48E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cuda (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cuda (7.9238470908598507E-002) differ by less than 4E-4 (1.3912582552677577e-07) *** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -561,42 +561,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.286741e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169304e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.847775e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.543814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718457e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.538024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.500599e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.620966e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.731481e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534581e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.446798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.755117e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.570459e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.399618e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626329e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276628e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 96ad54f38a..0382386146 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:14:07 +DATE: 2024-06-03_19:17:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.7139s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3728s - [COUNTERS] Fortran MEs ( 1 ) : 0.3411s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3719s + [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create 
events.lhe) *** -------------------- @@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6813s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] PROGRAM TOTAL : 0.6737s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3327s [COUNTERS] Fortran MEs ( 1 ) : 0.3410s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3605s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6049s - [COUNTERS] Fortran MEs ( 1 ) : 3.7556s for 90112 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3546s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6038s + [COUNTERS] Fortran MEs ( 1 ) : 3.7508s for 90112 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0389s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6816s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3573s for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0436s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3605s for 8192 events => throughput is 2.27E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' 
./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.8404s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9370s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.9034s for 90112 events => throughput is 2.31E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.9193s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9462s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.9731s for 90112 events => throughput is 2.27E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.375362e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.347572e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.376875e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.354672e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6879s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1785s for 8192 events => throughput is 4.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5128s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1837s for 8192 events => throughput is 4.46E+04 events/s *** 
(2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8198s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7730s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.0469s for 90112 events => throughput is 4.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8495s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7815s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.0680s for 90112 events => throughput is 4.36E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.714685e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.566169e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.706962e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.556033e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5127s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0904s for 8192 events => throughput is 9.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5284s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4359s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 8192 events => throughput is 8.85E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6914s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6873s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0040s for 90112 events => throughput is 8.98E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7188s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7021s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0167s for 90112 events => throughput is 8.86E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.227515e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.872567e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.179808e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.054172e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4935s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0814s for 8192 events => throughput is 1.01E+05 
events/s + [COUNTERS] PROGRAM TOTAL : 0.4975s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 8192 events => throughput is 9.99E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5536s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8848s for 90112 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5873s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9054s for 90112 events => throughput is 9.95E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.050974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.020713e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.057681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.428501e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5760s - [COUNTERS] Fortran Overhead ( 0 ) 
: 0.4541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1219s for 8192 events => throughput is 6.72E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5807s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4574s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1233s for 8192 events => throughput is 6.65E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.0588s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7176s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3412s for 90112 events => throughput is 6.72E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.0779s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7203s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3576s for 90112 events => throughput is 6.64E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.837164e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.732100e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.867816e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.720907e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7767s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7713s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7725s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,8 +546,8 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0439s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0209s + [COUNTERS] PROGRAM TOTAL : 2.0477s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0247s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 90112 events => throughput is 3.92E+06 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.640467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628855e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.045816e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.119035e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.873716e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.920924e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232563e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233981e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.885545e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.909258e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.243680e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244342e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.866976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.916143e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.723701e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.724074e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index c981799588..abf93aa0ee 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,21 +1,21 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:14:52 +DATE: 2024-06-03_19:18:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.6980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2611s - [COUNTERS] Fortran MEs ( 1 ) : 4.4369s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6956s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s + [COUNTERS] Fortran MEs ( 1 ) : 4.4312s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.6906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s - [COUNTERS] Fortran MEs ( 1 ) : 4.4335s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7080s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2582s + [COUNTERS] Fortran MEs ( 1 ) : 4.4498s for 
8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.6666s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8936s - [COUNTERS] Fortran MEs ( 1 ) : 48.7731s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.6378s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8890s + [COUNTERS] Fortran MEs ( 1 ) : 48.7488s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222236] fbridge_mode=1 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 9.2716s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7071s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5645s for 8192 events => throughput is 1.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.2997s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7114s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5884s for 8192 events => throughput is 1.79E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index e5afb01bb6..6e511fc492 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -3,20 +3,20 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 
BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:16:03 +DATE: 2024-06-03_19:19:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.6224s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s - [COUNTERS] Fortran MEs ( 1 ) : 4.3645s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2597s + [COUNTERS] Fortran MEs ( 1 ) : 4.4379s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.6165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2540s - [COUNTERS] Fortran MEs ( 1 ) : 4.3625s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2559s + [COUNTERS] Fortran MEs ( 1 ) : 4.4384s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 
50.5574s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8890s - [COUNTERS] Fortran MEs ( 1 ) : 48.6684s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.6976s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8928s + [COUNTERS] Fortran MEs ( 1 ) : 48.8049s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.4632 [0.46320716609204404] fbridge_mode=1 + [XSECTION] Cross section = 0.4632 [0.46320716615478996] fbridge_mode=1 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 8.9166s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5182s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3984s for 8192 events => throughput is 1.86E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.0183s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5713s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4470s for 8192 events => throughput is 1.84E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.46320556621222242) and cpp (0.46320716609204404) differ by less than 4E-4 (3.453930475627587e-06) +OK! xsec from fortran (0.46320556621222242) and cpp (0.46320716615478996) differ by less than 4E-4 (3.4540659359372228e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 05784aaa7b..3bc8869524 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -2,18 +2,18 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda - -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:17:14 +DATE: 2024-06-03_19:20:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events 
(found 187 events) - [COUNTERS] PROGRAM TOTAL : 4.6921s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2633s - [COUNTERS] Fortran MEs ( 1 ) : 4.4288s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6850s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2593s + [COUNTERS] Fortran MEs ( 1 ) : 4.4257s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 4.6873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2561s - [COUNTERS] Fortran MEs ( 1 ) : 4.4311s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6877s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2565s + [COUNTERS] Fortran MEs ( 1 ) : 4.4312s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 294 events) - [COUNTERS] PROGRAM TOTAL : 50.6939s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8954s - [COUNTERS] Fortran MEs ( 1 ) : 48.7985s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.6566s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8924s + [COUNTERS] Fortran MEs ( 1 ) : 48.7641s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.4632 [0.46320556893412546] 
fbridge_mode=1 [UNWEIGHT] Wrote 11 events (found 168 events) - [COUNTERS] PROGRAM TOTAL : 9.3537s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7437s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6100s for 8192 events => throughput is 1.78E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.3897s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7555s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6343s for 8192 events => throughput is 1.77E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index b9fb2f5206..0a010620d3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - - make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -18,10 +18,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:19:55 +DATE: 2024-06-03_19:23:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 103.1254s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5184s - [COUNTERS] Fortran MEs ( 1 ) : 102.6070s for 8192 events => throughput is 7.98E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5123s + [COUNTERS] Fortran MEs ( 1 ) : 102.5945s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 103.1957s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5142s - [COUNTERS] Fortran MEs ( 1 ) : 102.6815s for 8192 events => throughput is 7.98E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1418s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5150s + [COUNTERS] Fortran MEs ( 1 ) : 102.6268s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 
[XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1132.8870s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4650s - [COUNTERS] Fortran MEs ( 1 ) : 1128.4220s for 90112 events => throughput is 7.99E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1133.8082s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4636s + [COUNTERS] Fortran MEs ( 1 ) : 1129.3446s for 90112 events => throughput is 7.98E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 242.5730s - [COUNTERS] Fortran Overhead ( 0 ) : 110.5170s - [COUNTERS] CudaCpp MEs ( 2 ) : 132.0559s for 8192 events => throughput is 6.20E+01 events/s + [COUNTERS] PROGRAM TOTAL : 244.7419s + [COUNTERS] Fortran Overhead ( 0 ) : 111.2652s + [COUNTERS] CudaCpp MEs ( 2 ) : 133.4766s for 8192 events => throughput is 6.14E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1558.9829s - [COUNTERS] Fortran Overhead ( 0 ) : 114.2344s - [COUNTERS] CudaCpp MEs ( 2 ) : 1444.7485s for 90112 events => throughput is 6.24E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1582.9709s + [COUNTERS] Fortran Overhead ( 0 ) : 113.0776s + [COUNTERS] CudaCpp MEs ( 2 ) : 1469.8933s for 90112 events => throughput is 6.13E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.436157e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.961551e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.453513e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.034483e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 115.6810s - [COUNTERS] Fortran Overhead ( 0 ) : 53.3874s - [COUNTERS] CudaCpp MEs ( 2 ) : 62.2936s for 8192 events => throughput is 1.32E+02 events/s + [COUNTERS] PROGRAM TOTAL : 118.5489s + [COUNTERS] Fortran Overhead ( 0 ) : 54.0240s + [COUNTERS] CudaCpp MEs ( 2 ) : 64.5249s for 8192 events => throughput is 1.27E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 738.8265s - [COUNTERS] Fortran Overhead ( 0 ) : 57.1821s - [COUNTERS] CudaCpp MEs ( 2 ) : 681.6444s for 90112 events => throughput is 1.32E+02 events/s + [COUNTERS] PROGRAM TOTAL : 768.8419s + [COUNTERS] Fortran Overhead ( 0 ) : 57.8845s + [COUNTERS] CudaCpp MEs ( 2 ) : 710.9575s for 90112 events => throughput 
is 1.27E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.557219e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.540227e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.557436e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531639e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.2051s - [COUNTERS] Fortran Overhead ( 0 ) : 24.9796s - [COUNTERS] CudaCpp MEs ( 2 ) : 29.2256s for 8192 events => throughput is 2.80E+02 events/s + [COUNTERS] PROGRAM TOTAL : 56.3786s + [COUNTERS] Fortran Overhead ( 0 ) : 25.2558s + [COUNTERS] CudaCpp MEs ( 2 ) : 31.1228s for 8192 events => throughput is 2.63E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 350.6938s - [COUNTERS] Fortran Overhead ( 0 ) : 29.4308s - [COUNTERS] CudaCpp MEs ( 2 ) : 321.2629s for 90112 events => throughput is 2.80E+02 events/s + [COUNTERS] PROGRAM TOTAL : 364.9544s + 
[COUNTERS] Fortran Overhead ( 0 ) : 29.3437s + [COUNTERS] CudaCpp MEs ( 2 ) : 335.6107s for 90112 events => throughput is 2.69E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.356831e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.314788e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.330655e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.309326e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.9010s - [COUNTERS] Fortran Overhead ( 0 ) : 22.3111s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.5899s for 8192 events => throughput is 2.97E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.0198s + [COUNTERS] Fortran Overhead ( 0 ) : 22.5357s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.4841s for 8192 events => throughput is 3.09E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 317.7874s - [COUNTERS] Fortran Overhead ( 0 ) : 26.2827s - [COUNTERS] 
CudaCpp MEs ( 2 ) : 291.5048s for 90112 events => throughput is 3.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 318.6826s + [COUNTERS] Fortran Overhead ( 0 ) : 26.5677s + [COUNTERS] CudaCpp MEs ( 2 ) : 292.1149s for 90112 events => throughput is 3.08E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.890746e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.747560e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.795517e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748803e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 51.3811s - [COUNTERS] Fortran Overhead ( 0 ) : 25.1741s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2071s for 8192 events => throughput is 3.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 51.6764s + [COUNTERS] Fortran Overhead ( 0 ) : 25.3722s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3043s for 8192 events => throughput is 3.11E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events 
(found 1531 events) - [COUNTERS] PROGRAM TOTAL : 320.5201s - [COUNTERS] Fortran Overhead ( 0 ) : 29.2606s - [COUNTERS] CudaCpp MEs ( 2 ) : 291.2595s for 90112 events => throughput is 3.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 321.9302s + [COUNTERS] Fortran Overhead ( 0 ) : 29.4944s + [COUNTERS] CudaCpp MEs ( 2 ) : 292.4359s for 90112 events => throughput is 3.08E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.335685e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299273e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.330505e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.311486e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 4.2720s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1877s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0843s for 8192 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2814s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1978s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0836s for 8192 events => throughput is 7.56E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] 
ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 19.1951s - [COUNTERS] Fortran Overhead ( 0 ) : 7.2572s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9379s for 90112 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 19.1278s + [COUNTERS] Fortran Overhead ( 0 ) : 7.2063s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9215s for 90112 events => throughput is 7.56E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.550389e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.520166e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.286009e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.318224e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.250646e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.228195e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.571508e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.554291e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.277996e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.213333e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.464805e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.436994e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.271016e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.231371e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.239190e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.233808e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 6600eb2c20..f07d69ad27 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -20,9 +20,9 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_23:54:30 +DATE: 2024-06-03_20:58:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 103.1481s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5128s - [COUNTERS] Fortran MEs ( 1 ) : 102.6352s for 8192 events => throughput is 7.98E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5122s + [COUNTERS] Fortran MEs ( 1 ) : 102.6151s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 103.0623s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5146s - [COUNTERS] Fortran MEs ( 1 ) : 102.5477s for 8192 events => throughput is 7.99E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.5195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5175s + [COUNTERS] Fortran MEs ( 1 ) : 103.0020s for 8192 events => throughput is 7.95E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 
events) - [COUNTERS] PROGRAM TOTAL : 1133.8589s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4674s - [COUNTERS] Fortran MEs ( 1 ) : 1129.3915s for 90112 events => throughput is 7.98E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1132.6620s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4718s + [COUNTERS] Fortran MEs ( 1 ) : 1128.1902s for 90112 events => throughput is 7.99E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719945779552E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 211.4040s - [COUNTERS] Fortran Overhead ( 0 ) : 97.0025s - [COUNTERS] CudaCpp MEs ( 2 ) : 114.4015s for 8192 events => throughput is 7.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 218.5597s + [COUNTERS] Fortran Overhead ( 0 ) : 99.9328s + [COUNTERS] CudaCpp MEs ( 2 ) : 118.6268s for 8192 events => throughput is 6.91E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719945779552E-006) differ by less than 4E-4 (0.00013985165319851944) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -166,15 +166,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290777570335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1358.5973s - [COUNTERS] Fortran Overhead ( 0 ) : 101.0487s - [COUNTERS] CudaCpp MEs ( 2 ) : 1257.5486s for 90112 events => throughput is 7.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1403.2262s + [COUNTERS] Fortran Overhead ( 0 ) : 103.7776s + [COUNTERS] CudaCpp MEs ( 2 ) : 1299.4486s for 90112 events => throughput is 6.93E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290777570335E-007) differ by less than 4E-4 (0.00014139226908471692) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -183,12 +183,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.461850e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.271213e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.478420e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.207492e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,15 +210,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716994349971E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 53.3849s - [COUNTERS] Fortran Overhead ( 0 ) : 25.1661s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2188s for 8192 events => throughput is 2.90E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.3585s + [COUNTERS] Fortran Overhead ( 0 ) : 25.2480s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.1105s for 8192 events => throughput is 2.91E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405717007921116E-006) differ by less than 4E-4 (0.00013961480525170877) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716994349971E-006) differ by less than 4E-4 (0.00013961371115600585) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -244,15 +244,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326284885505778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 339.1541s - [COUNTERS] Fortran Overhead ( 0 ) : 29.1542s - [COUNTERS] CudaCpp MEs ( 2 ) : 309.9999s for 90112 events => throughput is 2.91E+02 events/s + [COUNTERS] PROGRAM TOTAL : 337.7243s + [COUNTERS] Fortran Overhead ( 0 ) : 29.3919s + [COUNTERS] CudaCpp MEs ( 2 ) : 308.3325s for 90112 events => throughput is 2.92E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007) differ by less than 4E-4 (0.00014114029707035236) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284885505778E-007) differ by less than 4E-4 (0.0001411396400787801) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -261,12 +261,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.372421e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319456e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.331584e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333192e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -288,15 +288,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716646933743E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 27.8063s - [COUNTERS] Fortran Overhead ( 0 ) : 12.9789s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.8274s for 8192 events => throughput is 5.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 27.7568s + [COUNTERS] Fortran Overhead ( 0 ) : 13.0050s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.7517s for 8192 events => throughput is 5.55E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716646933743E-006) differ by less than 4E-4 (0.00013958570271999093) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -322,15 +322,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326277033163402E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 179.6098s - [COUNTERS] Fortran Overhead ( 0 ) : 16.9911s - [COUNTERS] CudaCpp MEs ( 2 ) : 162.6187s for 90112 events => throughput is 5.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 182.5340s + [COUNTERS] Fortran Overhead ( 0 ) : 16.9671s + [COUNTERS] CudaCpp MEs ( 2 ) : 165.5669s for 90112 events => throughput is 5.44E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277033163402E-007) differ by less than 4E-4 (0.00014080296191987252) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -339,12 +339,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.562410e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.451804e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.526748e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.359616e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -366,15 +366,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716646933743E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 24.3059s - [COUNTERS] Fortran Overhead ( 0 ) : 11.3922s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.9137s for 8192 events => throughput is 6.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.3577s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8276s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.5301s for 8192 events => throughput is 6.05E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716646933743E-006) differ by less than 4E-4 (0.00013958570271999093) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -400,15 +400,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326277033163402E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 157.4560s - [COUNTERS] Fortran Overhead ( 0 ) : 15.2596s - [COUNTERS] CudaCpp MEs ( 2 ) : 142.1964s for 90112 events => throughput is 6.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 162.5130s + [COUNTERS] Fortran Overhead ( 0 ) : 15.7082s + [COUNTERS] CudaCpp MEs ( 2 ) : 146.8048s for 90112 events => throughput is 6.14E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277033163402E-007) differ by less than 4E-4 (0.00014080296191987252) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -417,12 +417,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.624295e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.494357e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.660814e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.552945e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -444,15 +444,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719257109645E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 25.8895s - [COUNTERS] Fortran Overhead ( 0 ) : 12.8401s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.0494s for 8192 events => throughput is 6.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 26.1335s + [COUNTERS] Fortran Overhead ( 0 ) : 12.8977s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.2357s for 8192 events => throughput is 6.19E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719306052570E-006) differ by less than 4E-4 (0.00013980007888836354) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719257109645E-006) differ by less than 4E-4 (0.00013979613314640815) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -478,15 +478,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326283665697276E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 161.6641s - [COUNTERS] Fortran Overhead ( 0 ) : 16.8758s - [COUNTERS] CudaCpp MEs ( 2 ) : 144.7883s for 90112 events => throughput is 6.22E+02 events/s + [COUNTERS] PROGRAM TOTAL : 164.1651s + [COUNTERS] Fortran Overhead ( 0 ) : 16.9607s + [COUNTERS] CudaCpp MEs ( 2 ) : 147.2044s for 90112 events => throughput is 6.12E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007) differ by less than 4E-4 (0.00014108709892313165) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283665697276E-007) differ by less than 4E-4 (0.00014108733939433016) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -495,12 +495,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.696273e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.616520e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.656469e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.662858e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -521,15 +521,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405721007137020E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 2.5420s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0407s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5013s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6523s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cuda (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) +OK! 
xsec from fortran (1.2403985227939174E-006) and cuda (1.2405721007137020E-006) differ by less than 4E-4 (0.00013993721904270728) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -554,15 +554,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326295421688232E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 11.4677s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0517s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4160s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.8998s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0681s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8317s for 90112 events => throughput is 1.55E+04 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cuda (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) +OK! xsec from fortran (2.3322993086655967E-007) and cuda (2.3326295421688232E-007) differ by less than 4E-4 (0.00014159139095037965) *** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -571,42 +571,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632797e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.542837e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.633172e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553403e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.151076e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.353428e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.169447e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.297312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.116112e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.399287e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.192928e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 
--bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.336831e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.145677e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.375180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.977159e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 91cbe4e948..9c9b469270 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_01:05:03 +DATE: 2024-06-03_22:10:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 103.0692s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5126s - [COUNTERS] Fortran MEs ( 1 ) : 102.5566s for 8192 events => throughput is 7.99E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5127s + [COUNTERS] Fortran MEs ( 1 ) : 102.6643s for 8192 events => throughput is 7.98E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 103.0859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5173s - [COUNTERS] Fortran MEs ( 1 ) : 102.5686s for 8192 events => throughput is 7.99E+01 events/s + [COUNTERS] PROGRAM TOTAL : 103.1001s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5105s + 
[COUNTERS] Fortran MEs ( 1 ) : 102.5896s for 8192 events => throughput is 7.99E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1132.5916s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4570s - [COUNTERS] Fortran MEs ( 1 ) : 1128.1345s for 90112 events => throughput is 7.99E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1133.0475s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4960s + [COUNTERS] Fortran MEs ( 1 ) : 1128.5515s for 90112 events => throughput is 7.98E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 237.2700s - [COUNTERS] Fortran Overhead ( 0 ) : 112.2091s - [COUNTERS] CudaCpp MEs ( 2 ) : 125.0609s for 8192 events => throughput is 6.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 240.6007s + [COUNTERS] Fortran Overhead ( 0 ) : 111.5910s + [COUNTERS] CudaCpp MEs ( 2 ) : 129.0097s for 8192 events => throughput is 6.35E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1488.3180s - [COUNTERS] Fortran Overhead ( 0 ) : 107.9254s - [COUNTERS] CudaCpp MEs ( 2 ) : 1380.3926s for 90112 events => throughput is 6.53E+01 events/s + 
[COUNTERS] PROGRAM TOTAL : 1543.7742s + [COUNTERS] Fortran Overhead ( 0 ) : 115.8104s + [COUNTERS] CudaCpp MEs ( 2 ) : 1427.9637s for 90112 events => throughput is 6.31E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.475500e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.367512e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.435461e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.359410e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 119.9634s - [COUNTERS] Fortran Overhead ( 0 ) : 54.9402s - [COUNTERS] CudaCpp MEs ( 2 ) : 65.0232s for 8192 events => throughput is 1.26E+02 events/s + [COUNTERS] PROGRAM TOTAL : 121.0844s + [COUNTERS] Fortran Overhead ( 0 ) : 55.3679s + [COUNTERS] CudaCpp MEs ( 2 ) : 65.7165s for 8192 events => throughput is 1.25E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 780.5801s - [COUNTERS] Fortran 
Overhead ( 0 ) : 59.5507s - [COUNTERS] CudaCpp MEs ( 2 ) : 721.0294s for 90112 events => throughput is 1.25E+02 events/s + [COUNTERS] PROGRAM TOTAL : 790.9699s + [COUNTERS] Fortran Overhead ( 0 ) : 59.0844s + [COUNTERS] CudaCpp MEs ( 2 ) : 731.8855s for 90112 events => throughput is 1.23E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.491581e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.498457e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.498726e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504018e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 52.6745s - [COUNTERS] Fortran Overhead ( 0 ) : 24.2901s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.3844s for 8192 events => throughput is 2.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.5710s + [COUNTERS] Fortran Overhead ( 0 ) : 24.2188s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.3522s for 8192 events => throughput is 2.79E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] 
fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 338.8982s - [COUNTERS] Fortran Overhead ( 0 ) : 28.2101s - [COUNTERS] CudaCpp MEs ( 2 ) : 310.6881s for 90112 events => throughput is 2.90E+02 events/s + [COUNTERS] PROGRAM TOTAL : 350.5467s + [COUNTERS] Fortran Overhead ( 0 ) : 28.1483s + [COUNTERS] CudaCpp MEs ( 2 ) : 322.3984s for 90112 events => throughput is 2.80E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.451072e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.486418e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.475095e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.471290e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 46.3456s - [COUNTERS] Fortran Overhead ( 0 ) : 21.3056s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.0400s for 8192 events => throughput is 3.27E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.9144s + [COUNTERS] Fortran Overhead ( 0 ) : 21.0704s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8440s for 8192 events => throughput is 3.30E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < 
/tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 298.0662s - [COUNTERS] Fortran Overhead ( 0 ) : 24.8292s - [COUNTERS] CudaCpp MEs ( 2 ) : 273.2370s for 90112 events => throughput is 3.30E+02 events/s + [COUNTERS] PROGRAM TOTAL : 297.8655s + [COUNTERS] Fortran Overhead ( 0 ) : 25.1053s + [COUNTERS] CudaCpp MEs ( 2 ) : 272.7602s for 90112 events => throughput is 3.30E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.043947e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.001594e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.044016e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.997185e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 50.3351s - [COUNTERS] Fortran Overhead ( 0 ) : 24.5435s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7916s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.9021s + [COUNTERS] Fortran Overhead ( 0 ) : 24.7015s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2007s for 8192 events => throughput is 3.13E+02 events/s *** (2-512z) Compare 
MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 314.8535s - [COUNTERS] Fortran Overhead ( 0 ) : 28.5171s - [COUNTERS] CudaCpp MEs ( 2 ) : 286.3364s for 90112 events => throughput is 3.15E+02 events/s + [COUNTERS] PROGRAM TOTAL : 314.7459s + [COUNTERS] Fortran Overhead ( 0 ) : 28.5760s + [COUNTERS] CudaCpp MEs ( 2 ) : 286.1699s for 90112 events => throughput is 3.15E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.418688e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.427026e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.414882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400989e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 3.6355s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8603s for 8192 events => throughput is 9.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6159s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7523s 
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8637s for 8192 events => throughput is 9.49E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 16.2537s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7785s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4752s for 90112 events => throughput is 9.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.2754s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7699s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5054s for 90112 events => throughput is 9.48E+03 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.440992e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.387299e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083652e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.078785e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110058e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106946e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.159066e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157033e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113035e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112313e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111683e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115590e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106360e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109459e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.643131e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.637429e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 9ef918bb40..c2ca23aed4 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,12 +2,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' @@ -16,7 +17,6 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:18:26 +DATE: 2024-06-03_19:21:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4837s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4091s - [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4113s + [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4137s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3388s + [COUNTERS] PROGRAM TOTAL : 0.4153s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4333s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6135s - [COUNTERS] Fortran MEs ( 1 ) : 0.8198s for 90112 events => 
throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4407s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6205s + [COUNTERS] Fortran MEs ( 1 ) : 0.8203s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4982s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4179s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0803s for 8192 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4977s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0795s for 8192 events => throughput is 1.03E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -164,15 +164,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5615s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6764s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8851s for 90112 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5507s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8799s for 90112 events => throughput is 1.02E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ by less than 3E-14 (1.1102230246251565e-16) +OK! 
xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.031593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051793e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.032655e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051469e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351262541] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4288s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0442s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4274s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3833s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262530) differ by less than 3E-14 (2.9531932455029164e-14) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539351262541) differ by less than 3E-14 (2.90878432451791e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.1183s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6354s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4829s for 90112 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1224s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6359s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4864s for 90112 events => throughput is 1.85E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.899679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.885005e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911483e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894578e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3660s - 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3925s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3666s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0259s for 8192 events => throughput is 3.16E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9014s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6204s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2810s for 90112 events => throughput is 3.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9029s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6186s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2843s for 90112 events => throughput is 3.17E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.204350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.213667e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.215560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.221920e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3630s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3863s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8699s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2535s for 90112 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6194s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2565s for 90112 events => throughput is 3.51E+05 events/s *** 
(2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.525960e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.651673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594108e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4108s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3757s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4117s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3763s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0354s for 8192 events => throughput is 2.31E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0218s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6331s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3887s for 90112 events => throughput is 2.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0153s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6261s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.3892s for 90112 events => throughput is 2.32E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.346690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.343890e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.362776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.358773e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7749s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7797s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7790s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0315s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0234s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 90112 events => throughput is 1.11E+07 
events/s + [COUNTERS] PROGRAM TOTAL : 2.0347s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.12E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544378e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.615457e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.084339e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994909e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.545581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.583205e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.516553e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.543893e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.562082e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584945e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.797868e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.793558e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.544309e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.582773e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.786814e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.778670e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 61b60dbfe4..3ccecc8b9e 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,13 +1,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu - make USEBUILDDIR=1 BACKEND=cuda + + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 - make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:18:56 +DATE: 2024-06-03_19:22:17 On 
itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4838s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s - [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s + [COUNTERS] Fortran MEs ( 1 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4164s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3413s - [COUNTERS] Fortran MEs ( 1 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3401s + [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4404s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6194s - [COUNTERS] Fortran MEs ( 1 ) : 0.8210s for 90112 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL 
: 2.4317s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6124s + [COUNTERS] Fortran MEs ( 1 ) : 0.8193s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110463158198617] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0761s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110463093540638) differ by less than 4E-4 (2.812844174915341e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110463158198617) differ by less than 4E-4 (2.8104591991429118e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,15 +164,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686347932190] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5157s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6736s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8421s for 90112 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4925s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6671s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8254s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ by less than 4E-4 (1.3172298474195543e-08) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686347932190) differ by less than 4E-4 (9.698858494111562e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110666e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.096977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112318e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110459183868807] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3952s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3682s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 8192 events => throughput is 3.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 8192 events => throughput is 3.00E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110459152958460) differ by less than 4E-4 (2.9581965829139634e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110459183868807) differ by less than 4E-4 (2.9570564231695684e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,15 +240,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510683073685827] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9207s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6222s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2985s for 90112 events => throughput is 3.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9156s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6148s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3007s for 90112 events => throughput is 3.00E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ by less than 4E-4 (1.6458771667782202e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683073685827) differ by less than 4E-4 (1.6191372875784538e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -257,12 +257,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.030351e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.971444e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.053000e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984942e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110460727141733] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3526s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3527s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.04E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110460727141733) differ by less than 4E-4 (2.9001312211729413e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,15 +316,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510682516942223] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7549s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6032s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1517s for 90112 events => throughput is 5.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7562s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6051s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1511s for 90112 events => throughput is 5.96E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682516942223) differ by less than 4E-4 (1.8779591537398943e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -333,12 +333,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.050114e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.945647e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.136523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.945682e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110460727141733] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110460727141733) differ by less than 4E-4 (2.9001312211729413e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,15 +392,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510682516942223] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7428s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6036s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1392s for 90112 events => throughput is 6.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7393s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1398s for 90112 events => throughput is 6.44E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682516942223) differ by less than 4E-4 (1.8779591537398943e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.414981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.484673e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.641906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.451127e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110464220032526] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3768s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3587s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0181s for 8192 events => throughput is 4.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3563s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0180s for 8192 events => throughput is 4.55E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110464176080312) differ by less than 4E-4 (2.772913590631809e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110464220032526) differ by less than 4E-4 (2.771292368253242e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -470,15 +470,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510685471570221] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8044s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1962s for 90112 events => throughput is 4.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8139s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6154s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1986s for 90112 events => throughput is 4.54E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ by less than 4E-4 (5.3231167917999755e-08) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685471570221) differ by less than 4E-4 (5.043963013928732e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -487,12 +487,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.700097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.696129e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.791964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.786616e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,15 +513,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110477321990667] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7736s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7730s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.53E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7740s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.47E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cuda (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) +OK! 
xsec from fortran (0.27110539351263330) and cuda (0.27110477321990667) differ by less than 4E-4 (2.2880132283242816e-06) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -546,15 +546,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510689318513457] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0277s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0213s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0381s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0315s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cuda (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) +OK! xsec from fortran (0.21510686556561295) and cuda (0.21510689318513457) differ by less than 4E-4 (1.2839907048700638e-07) *** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -563,42 +563,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.752984e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.519205e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.407709e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.044863e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.072830e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.851877e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.713385e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463513e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.043933e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.826046e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.812134e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.506498e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.573075e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.403095e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.978342e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.350208e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 077dc6a885..d1ba5b1197 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu - - make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,8 +13,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-02_22:19:25 +DATE: 2024-06-03_19:22:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4860s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s + [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4160s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3409s - [COUNTERS] Fortran MEs ( 1 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4152s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s + [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** 
-------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4475s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6257s - [COUNTERS] Fortran MEs ( 1 ) : 0.8218s for 90112 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4393s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6188s + [COUNTERS] Fortran MEs ( 1 ) : 0.8204s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539350666329] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.5015s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4206s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0809s for 8192 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4978s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0795s for 8192 events => throughput is 1.03E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539350666329) differ by less than 2E-4 (2.2020940626532592e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,15 +164,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560103207] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5666s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6744s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8923s for 90112 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5505s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6724s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8781s for 90112 events => throughput is 1.03E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ by less than 2E-4 (1.967879192932287e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560103207) differ by less than 2E-4 (1.646582870051816e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.031508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.044856e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.030453e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050093e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539350666335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4283s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4261s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539350666335) differ by less than 2E-4 (2.2020718581927667e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,15 +240,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560103204] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.1044s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4653s for 90112 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1129s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6349s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4780s for 90112 events => throughput is 1.89E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ by less than 2E-4 (1.9678769724862377e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560103204) differ by less than 2E-4 (1.6465806496057667e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -257,12 +257,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.916584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893360e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.929453e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.897592e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539330887440] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3659s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 8192 events => throughput is 3.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539330887440) differ by less than 2E-4 (7.515855715567454e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,15 +316,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686557693198] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8986s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2806s for 90112 events => throughput is 3.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8943s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6150s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2793s for 90112 events => throughput is 3.23E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686557693198) differ by less than 2E-4 (5.262057456434377e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -333,12 +333,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.276659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279549e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243257e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310753e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539330887440] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3866s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3640s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539330887440) differ by less than 2E-4 (7.515855715567454e-10) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,15 +392,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686557693198] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8612s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6158s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2454s for 90112 events => throughput is 3.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2502s for 90112 events => throughput is 3.60E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686557693198) differ by less than 2E-4 (5.262057456434377e-11) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.762930e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.690436e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.712469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609255e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539330887440] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0362s for 8192 events => throughput is 2.26E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4109s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3747s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.26E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539330887440) differ by less than 2E-4 (7.515855715567454e-10) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,15 +468,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686557693198] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0311s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4005s for 90112 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0171s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6196s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3975s for 90112 events => throughput is 2.27E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686557693198) differ by less than 2E-4 (5.262057456434377e-11) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -485,12 +485,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.283401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.303007e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.295262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.295044e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,8 +513,8 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7745s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7738s + [COUNTERS] PROGRAM TOTAL : 0.7746s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7739s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0305s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0225s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.12E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0203s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0124s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 90112 events => throughput is 1.14E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.542291e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.610839e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.061498e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080962e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.547161e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.562738e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.524448e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.542804e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.546105e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577824e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.795446e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.836954e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.543201e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.587620e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.784959e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.784773e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 26ba9b7ba1..abbc397291 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -3,10 +3,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/h make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:38:17 +DATE: 2024-06-03_23:45:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,8 +58,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] 
fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9057s + [COUNTERS] PROGRAM TOTAL : 0.9534s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9038s [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3760s - [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4252s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3755s + [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8801s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3361s - [COUNTERS] Fortran MEs ( 1 ) : 0.5440s for 90112 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8842s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3398s + [COUNTERS] Fortran MEs ( 1 ) : 0.5444s for 90112 events => throughput is 1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256148] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM 
TOTAL : 0.4617s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4322s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8500s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3666s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4834s for 90112 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9475s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3732s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5743s for 90112 events => throughput is 1.57E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.915502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625932e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.626172e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4316s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4033s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0283s for 8192 events => throughput is 2.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256161) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (2.0162955499256161) and cpp (2.0162955499256152) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.6267s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2758s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6688s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3141s for 90112 events => throughput is 2.87E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.381075e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.945292e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.398450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966051e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256232] fbridge_mode=1 
[UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4040s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4083s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3911s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256152) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256232) differ by less than 3E-14 (3.552713678800501e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,15 +316,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895240377489] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.5233s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3587s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1646s for 90112 events => throughput is 5.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5312s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3409s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1902s for 90112 events => throughput is 4.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377560) differ by less than 3E-14 (4.440892098500626e-16) +OK! 
xsec from fortran (2.0434895240377569) and cpp (2.0434895240377489) differ by less than 3E-14 (3.885780586188048e-15) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.463798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.728526e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.554107e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.749171e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256232] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4072s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3936s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4060s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3902s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.17E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256152) differ by less than 3E-14 (4.440892098500626e-16) +OK! 
xsec from fortran (2.0162955499256161) and cpp (2.0162955499256232) differ by less than 3E-14 (3.552713678800501e-15) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,15 +392,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895240377489] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.4890s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3389s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1501s for 90112 events => throughput is 6.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5235s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3489s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1746s for 90112 events => throughput is 5.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377560) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377489) differ by less than 3E-14 (3.885780586188048e-15) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -409,12 +409,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.050885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.263388e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.130883e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.235510e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4200s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4026s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0240s for 8192 events => throughput is 3.41E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.6056s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3528s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2528s for 90112 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6177s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2642s for 90112 events => throughput is 3.41E+05 events/s *** 
(2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.666281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466103e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.687428e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496542e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256165] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8128s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8122s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377573] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.7684s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7616s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 90112 events => throughput is 1.32E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7737s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7669s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 90112 events => throughput is 1.32E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.987095e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020955e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.307521e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.349598e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.131525e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.245064e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.834387e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.794002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.134822e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.241343e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.047145e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049564e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.124622e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.262918e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.749909e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.751190e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index c173d3145a..c4d025695a 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,13 +1,13 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:38:44 +DATE: 2024-06-03_23:45:48 On itscrd90.cern.ch [CPU: 
Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9566s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9069s - [COUNTERS] Fortran MEs ( 1 ) : 0.0498s for 8192 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9463s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8967s + [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3743s + [COUNTERS] PROGRAM TOTAL : 0.4218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s [COUNTERS] Fortran MEs ( 1 ) : 0.0496s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8822s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3377s - [COUNTERS] Fortran MEs ( 1 ) : 0.5445s for 90112 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8730s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3311s + [COUNTERS] Fortran MEs ( 1 ) : 0.5419s for 90112 events => throughput is 
1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162897371946169] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162897355760356] fbridge_mode=1 [UNWEIGHT] Wrote 1620 events (found 1625 events) - [COUNTERS] PROGRAM TOTAL : 0.4570s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0420s for 8192 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4713s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4229s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0484s for 8192 events => throughput is 1.69E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256161) and cpp (2.0162897371946169) differ by less than 4E-4 (2.8828764708777044e-06) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162897355760356) differ by less than 4E-4 (2.8836792208553064e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index bf03415f4c..6343a38ff0 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone + + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:38:50 +DATE: 2024-06-03_23:45:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9525s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9028s - [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9024s + [COUNTERS] Fortran MEs ( 1 ) : 0.0498s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4237s - [COUNTERS] Fortran Overhead ( 0 ) : 
0.3740s - [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3757s + [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8805s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3361s - [COUNTERS] Fortran MEs ( 1 ) : 0.5444s for 90112 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8825s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3366s + [COUNTERS] Fortran MEs ( 1 ) : 0.5459s for 90112 events => throughput is 1.65E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955975930954] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4637s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4196s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4783s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0520s for 8192 events => throughput is 1.58E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,9 +168,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895706383660] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.8548s - [COUNTERS] Fortran 
Overhead ( 0 ) : 1.3687s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4861s for 90112 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9468s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3741s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5727s for 90112 events => throughput is 1.57E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -184,13 +184,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.787574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.524718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.794973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.526974e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -214,9 +214,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955975930958] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4230s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3983s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 8192 events => throughput is 3.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4305s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4020s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -248,9 +248,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW 
[XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895706383669] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.6212s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2733s for 90112 events => throughput is 3.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6727s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3582s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3145s for 90112 events => throughput is 2.87E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -264,13 +264,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.207786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799476e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.808005e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -292,15 +292,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955953696393] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4046s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3894s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0152s for 8192 events => throughput is 5.39E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 0.4100s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.65E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955953691082) differ by less than 2E-4 (2.253811048902321e-08) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955953696393) differ by less than 2E-4 (2.2538374055969257e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -326,15 +326,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895701245432] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.5132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3424s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1708s for 90112 events => throughput is 5.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5426s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3446s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1979s for 90112 events => throughput is 4.55E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701243878) differ by less than 2E-4 (2.255290776354002e-08) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701245432) differ by less than 2E-4 (2.255298392483951e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -344,13 +344,13 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.675666e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.516759e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.789339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447333e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -372,15 +372,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955953696393] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4077s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3908s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955953691082) differ by less than 2E-4 (2.253811048902321e-08) +OK! 
xsec from fortran (2.0162955499256161) and cpp (2.0162955953696393) differ by less than 2E-4 (2.2538374055969257e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -406,15 +406,15 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895701245432] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.4951s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3399s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1552s for 90112 events => throughput is 5.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5232s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3443s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1789s for 90112 events => throughput is 5.04E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701243878) differ by less than 2E-4 (2.255290776354002e-08) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701245432) differ by less than 2E-4 (2.255298392483951e-08) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -424,13 +424,13 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.160618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.027941e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.006884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.023815e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -454,9 +454,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4038s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.27E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -488,9 +488,9 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.6127s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3526s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2600s for 90112 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3505s 
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2762s for 90112 events => throughput is 3.26E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -504,13 +504,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.310020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.115986e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.332375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.115887e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -533,9 +533,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0162955503257827] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8117s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8112s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.44E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8112s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -566,9 +566,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.043 [2.0434895242795732] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 1.7688s - [COUNTERS] Fortran 
Overhead ( 0 ) : 1.7618s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7706s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7638s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 90112 events => throughput is 1.33E+07 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -581,42 +581,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921899e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934115e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.241102e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.295137e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.122777e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.242460e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.754404e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.690931e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.139288e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 3.233099e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.037609e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.042411e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.121155e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248320e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.701139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.726143e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index fa651276a5..4621353c13 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - make USEBUILDDIR=1 BACKEND=cuda + + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -22,8 +22,8 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:50 +DATE: 2024-06-03_23:47:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 2.7116s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3448s - [COUNTERS] Fortran MEs ( 1 ) : 2.3668s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7054s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s + [COUNTERS] Fortran MEs ( 1 ) : 2.3614s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 
2.7131s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s - [COUNTERS] Fortran MEs ( 1 ) : 2.3711s for 8192 events => throughput is 3.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6939s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3387s + [COUNTERS] Fortran MEs ( 1 ) : 2.3552s for 8192 events => throughput is 3.48E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > / [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 27.8163s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8317s - [COUNTERS] Fortran MEs ( 1 ) : 25.9846s for 90112 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.8531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8283s + [COUNTERS] Fortran MEs ( 1 ) : 26.0248s for 90112 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 5.3378s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7898s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5480s for 8192 events => throughput is 3.22E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.3299s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7884s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5414s for 8192 events => throughput is 3.22E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438187E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events 
(found 1705 events) - [COUNTERS] PROGRAM TOTAL : 32.4420s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2681s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.1739s for 90112 events => throughput is 3.20E+03 events/s + [COUNTERS] PROGRAM TOTAL : 32.3003s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2656s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.0347s for 90112 events => throughput is 3.21E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.365920e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367889e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.364548e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.359695e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084412E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.9775s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6436s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3339s for 8192 events => throughput is 6.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.9921s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6518s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3403s for 8192 events => throughput is 6.11E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < 
/tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 17.7362s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1158s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.6204s for 90112 events => throughput is 6.16E+03 events/s + [COUNTERS] PROGRAM TOTAL : 17.9427s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1233s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.8194s for 90112 events => throughput is 6.08E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.420385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.338176e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.404299e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.331780e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.5181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5898s for 8192 events => throughput is 1.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.5232s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9271s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5961s for 8192 events => throughput is 
1.37E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 8.8864s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3960s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.4904s for 90112 events => throughput is 1.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.9418s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4012s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.5407s for 90112 events => throughput is 1.38E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.431830e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.413771e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.434276e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409536e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.3701s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8508s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5193s for 8192 events => throughput is 1.58E+04 events/s + 
[COUNTERS] PROGRAM TOTAL : 1.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5308s for 8192 events => throughput is 1.54E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 8.0478s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3265s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.7213s for 90112 events => throughput is 1.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.1825s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3348s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8477s for 90112 events => throughput is 1.54E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635240e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595212e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.638985e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.601206e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM 
TOTAL : 1.7295s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0334s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6961s for 8192 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7489s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0442s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7047s for 8192 events => throughput is 1.16E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 10.1646s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5113s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.6534s for 90112 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.2686s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5215s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.7470s for 90112 events => throughput is 1.16E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.195609e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193007e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.192816e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186876e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8336s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8165s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.79E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8125s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.81E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.4746s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1885s for 90112 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4558s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2679s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.1879s for 90112 events => throughput is 4.79E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.844103e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.821310e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.234617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.230744e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.148821e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.201290e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.426966e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.426172e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.160825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.204680e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.418503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.412346e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.147643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.157841e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.759279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.754431e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 82dd2d8d1d..ce966a1838 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:42:29 +DATE: 2024-06-03_23:50:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 2.7113s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3443s - [COUNTERS] Fortran MEs ( 1 ) : 2.3670s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7131s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3446s + [COUNTERS] Fortran MEs ( 1 ) : 2.3686s for 8192 events => throughput is 3.46E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create 
events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.7137s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3424s - [COUNTERS] Fortran MEs ( 1 ) : 2.3712s for 8192 events => throughput is 3.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7082s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3416s + [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > / [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 27.8698s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8354s - [COUNTERS] Fortran MEs ( 1 ) : 26.0343s for 90112 events => throughput is 3.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.8958s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8328s + [COUNTERS] Fortran MEs ( 1 ) : 26.0630s for 90112 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896785213255034E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896784952157763E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 5.1513s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7119s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4394s for 8192 events => throughput is 3.36E+03 events/s + [COUNTERS] PROGRAM 
TOTAL : 5.2771s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7695s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5075s for 8192 events => throughput is 3.27E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896785213255034E-007) differ by less than 4E-4 (1.0921373827521563e-06) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896784952157763E-007) differ by less than 4E-4 (1.088869447052332e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,15 +164,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668138359550833E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668138450782073E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 31.0280s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1848s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.8433s for 90112 events => throughput is 3.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 31.8193s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2428s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.5764s for 90112 events => throughput is 3.27E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668138359550833E-007) differ by less than 4E-4 (7.148752136920677e-07) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668138450782073E-007) differ by less than 4E-4 (7.160651642745819e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.473180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.389337e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.467630e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386613e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896766542858863E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.7129s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0222s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6907s for 8192 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7533s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7097s for 8192 events => throughput is 1.15E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668121906848987E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 10.0727s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4928s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.5799s for 90112 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.2875s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5167s + [COUNTERS] CudaCpp MEs ( 2 ) : 
7.7708s for 90112 events => throughput is 1.16E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.217938e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177479e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.215916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192047e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.9442s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6416s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3027s for 8192 events => throughput is 2.71E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6446s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3075s for 8192 events => throughput is 2.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 5.4581s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1125s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3455s for 90112 
events => throughput is 2.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4843s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1128s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3715s for 90112 events => throughput is 2.67E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.798829e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.632309e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.800347e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786027e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8729s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6055s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2674s for 8192 events => throughput is 3.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6104s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2772s for 8192 events => throughput is 2.96E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 
events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 5.0205s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0739s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.9465s for 90112 events => throughput is 3.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1316s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0816s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.0500s for 90112 events => throughput is 2.95E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.168404e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.102770e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.175648e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.101869e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896778056937195E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.0352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6880s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3472s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0539s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6971s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3568s for 8192 events => throughput is 2.30E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp 
< /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668139178203571E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 5.9908s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1644s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8264s for 90112 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 6.0424s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1598s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8825s for 90112 events => throughput is 2.32E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392336e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373238e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.396345e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351931e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896805369365078E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896802503195373E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8232s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0140s for 8192 events => throughput is 5.87E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 0.8242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.37E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084444E-007) and cuda (7.9896805369365078E-007) differ by less than 4E-4 (1.3444145174901223e-06) +OK! xsec from fortran (7.9896697955084444E-007) and cuda (7.9896802503195373E-007) differ by less than 4E-4 (1.3085410737190273e-06) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,15 +544,15 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668194616292154E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668190930428073E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.4312s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2773s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1539s for 90112 events => throughput is 5.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4347s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1688s for 90112 events => throughput is 5.34E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438230E-007) and cuda (7.6668194616292154E-007) differ by less than 4E-4 (1.4486452351025747e-06) +OK! xsec from fortran (7.6668083551438230E-007) and cuda (7.6668190930428073E-007) differ by less than 4E-4 (1.400569635601201e-06) *** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -561,42 +561,42 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.221227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.888607e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.502146e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.127647e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.377943e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313327e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.384704e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.347449e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.364731e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.319621e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.409606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.347449e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.373672e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.311081e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.829768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.669591e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index d5f006f577..2411f64ea5 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:44:38 +DATE: 2024-06-03_23:52:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,9 +58,9 @@ Executing ' 
./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 2.7088s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3456s - [COUNTERS] Fortran MEs ( 1 ) : 2.3632s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s + [COUNTERS] Fortran MEs ( 1 ) : 2.3649s for 8192 events => throughput is 3.46E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.6651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3397s - [COUNTERS] Fortran MEs ( 1 ) : 2.3253s for 8192 events => throughput is 3.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7090s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3410s + [COUNTERS] Fortran MEs ( 1 ) : 2.3680s for 8192 events => throughput is 3.46E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > / [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 27.5341s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8166s - [COUNTERS] Fortran MEs ( 1 ) : 25.7175s for 90112 events => throughput is 3.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.8934s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8328s + [COUNTERS] Fortran MEs ( 1 ) : 26.0605s for 90112 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** 
-------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696375074447E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 5.3722s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8085s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5637s for 8192 events => throughput is 3.20E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.3724s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8125s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5599s for 8192 events => throughput is 3.20E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668081976882373E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 32.5074s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2758s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2316s for 90112 events => throughput is 3.19E+03 events/s + [COUNTERS] PROGRAM TOTAL : 32.5052s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2871s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2181s for 90112 events => throughput is 3.19E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.343554e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.327615e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.344274e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321067e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696285825688E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.9360s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6221s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3139s for 8192 events => throughput is 6.23E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.9600s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6388s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3212s for 8192 events => throughput is 6.20E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668081890954375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 17.4852s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0946s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.3906s for 90112 events => throughput is 6.26E+03 events/s + [COUNTERS] PROGRAM TOTAL : 17.6265s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1016s + [COUNTERS] CudaCpp MEs ( 2 ) : 
14.5249s for 90112 events => throughput is 6.20E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.609788e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.519847e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.608079e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.583575e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.5027s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9146s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5881s for 8192 events => throughput is 1.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.5115s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9209s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5905s for 8192 events => throughput is 1.39E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 8.8508s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3935s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.4573s for 90112 
events => throughput is 1.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.8748s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3870s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4878s for 90112 events => throughput is 1.39E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.448003e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433679e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.440825e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.437201e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.3689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8537s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5151s for 8192 events => throughput is 1.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3594s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5173s for 8192 events => throughput is 1.58E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 
events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 7.9986s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3246s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.6740s for 90112 events => throughput is 1.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.0284s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3154s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7130s for 90112 events => throughput is 1.58E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.644369e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634885e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646050e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635335e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.7590s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7096s for 8192 events => throughput is 1.15E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7634s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0492s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7142s for 8192 events => throughput is 1.15E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp 
< /tmp/avalassi/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 10.3214s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5233s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.7981s for 90112 events => throughput is 1.16E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.3734s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5270s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.8464s for 90112 events => throughput is 1.15E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.172848e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160024e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.173704e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.166016e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896697918297644E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8320s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8149s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8347s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8174s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 
4.76E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668083551547592E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.4765s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1893s for 90112 events => throughput is 4.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4752s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1898s for 90112 events => throughput is 4.75E+05 events/s *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,42 +561,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.819015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.825550e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.183583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.215700e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.112239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.114878e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.382293e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.382078e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.110807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.121155e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.384672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.389119e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.154224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.127818e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.747274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.741522e+05 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index b228907f76..4cb39399b2 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:35 +DATE: 2024-06-03_23:47:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6577s - [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6691s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6597s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3832s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3926s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3833s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.78E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM 
TOTAL : 1.4414s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3401s - [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4460s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3448s + [COUNTERS] Fortran MEs ( 1 ) : 0.1011s for 90112 events => throughput is 8.91E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 7dc3f4f16e..9a6633a8ba 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,22 +1,22 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:40 +DATE: 2024-06-03_23:47:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6610s - [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6661s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6568s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.75E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s - [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3918s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3824s + [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.78E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create 
events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4420s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3407s - [COUNTERS] Fortran MEs ( 1 ) : 0.1013s for 90112 events => throughput is 8.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4402s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3391s + [COUNTERS] Fortran MEs ( 1 ) : 0.1011s for 90112 events => throughput is 8.91E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 89c65c015c..986149f14d 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make 
USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:45 +DATE: 2024-06-03_23:47:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1767 events (found 4306 events) - [COUNTERS] PROGRAM TOTAL : 0.6710s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6616s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6652s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6557s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0 [UNWEIGHT] Wrote 1636 events (found 1641 events) - [COUNTERS] PROGRAM TOTAL : 0.3927s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3833s - [COUNTERS] Fortran MEs ( 1 ) : 0.0093s for 8192 events => throughput is 8.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3986s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.3893s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0 [UNWEIGHT] Wrote 1828 events (found 1833 events) - [COUNTERS] PROGRAM TOTAL : 1.4406s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3394s - [COUNTERS] Fortran MEs ( 1 ) : 0.1012s for 90112 events => throughput is 8.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4476s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3462s + [COUNTERS] Fortran MEs ( 1 ) : 0.1014s for 90112 events => throughput is 8.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index c3b7dfd598..70027dc2e0 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,10 +1,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=cuda - +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:17 +DATE: 2024-06-03_23:46:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,8 +58,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.8317s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7874s + [COUNTERS] PROGRAM TOTAL : 0.8354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7911s [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4203s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3760s - [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 
events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4255s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3811s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.8213s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3345s - [COUNTERS] Fortran MEs ( 1 ) : 0.4868s for 90112 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8207s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3348s + [COUNTERS] Fortran MEs ( 1 ) : 0.4859s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,12 +131,473 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 171.8 [171.81273026311101] fbridge_mode=1 - [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.7094s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6690s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s + [XSECTION] Cross section = 44.6 [44.598860065419863] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4668s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4210s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0458s for 8192 events => throughput is 1.79E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (44.598860065419856) and cpp (171.81273026311101) differ by more than 3E-14 (2.852401832941188) +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598860065419863) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.8456s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3395s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5061s for 90112 events => throughput is 1.78E+05 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577523870256471) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.826181e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.832182e+05 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4287s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4030s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598860065419856) differ by less than 3E-14 (0.0) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.6007s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2851s for 90112 events => throughput is 3.16E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577523870256471) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.247701e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.267024e+05 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4078s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3919s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.14E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598860065419856) differ by less than 3E-14 (0.0) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4943s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3116s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1826s for 90112 events => throughput is 4.93E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577523870256485) differ by less than 3E-14 (6.661338147750939e-16) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.478211e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.069207e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4060s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.59E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598860065419856) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4746s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3115s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1631s for 90112 events => throughput is 5.53E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577523870256485) differ by less than 3E-14 (6.661338147750939e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.555567e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.463438e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4230s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.45E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598860065419856) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.5844s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3210s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2635s for 90112 events => throughput is 3.42E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577523870256485) differ by less than 3E-14 (6.661338147750939e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.500188e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.549800e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.8119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8113s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.44E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cuda (44.598860065419849) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.7404s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7337s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cuda (44.577523870256485) differ by less than 3E-14 (6.661338147750939e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.139272e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.627494e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.321194e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.077683e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.312625e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152217e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.316393e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 
65536 8 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.083910e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 72c1ceb733..72c90c66fe 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/s make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -13,18 +13,18 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:23 +DATE: 2024-06-03_23:46:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,8 +58,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.8348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7904s + [COUNTERS] PROGRAM TOTAL : 0.8335s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7892s [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 
44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4224s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3780s - [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3829s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.8224s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3351s - [COUNTERS] Fortran MEs ( 1 ) : 0.4873s for 90112 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8213s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3346s + [COUNTERS] Fortran MEs ( 1 ) : 0.4867s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,12 +131,473 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 171.8 [171.81270286137041] fbridge_mode=1 - [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.7073s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0374s for 8192 events => throughput is 2.19E+05 events/s + [XSECTION] Cross section = 44.6 [44.598853620719339] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4672s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s *** (2-none) 
Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (44.598860065419856) and cpp (171.81270286137041) differ by more than 4E-4 (2.8524012185366816) +OK! xsec from fortran (44.598860065419856) and cpp (44.598853620719339) differ by less than 4E-4 (1.4450370500185272e-07) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577522280119403] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.8122s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4754s for 90112 events => throughput is 1.90E+05 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.577523870256456) and cpp (44.577522280119403) differ by less than 4E-4 (3.567127371262302e-08) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.936406e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.938322e+05 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598849697851406] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.67E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.598860065419856) and cpp (44.598849697851406) differ by less than 4E-4 (2.3246263325393812e-07) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577518590213366] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.5105s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3150s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1955s for 90112 events => throughput is 4.61E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577518590213366) differ by less than 4E-4 (1.1844630731783212e-07) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.616144e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.702667e+05 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598850036412124] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3970s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.58E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.598860065419856) and cpp (44.598850036412124) differ by less than 4E-4 (2.2487139172966408e-07) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577518612400254] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4052s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3022s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1030s for 90112 events => throughput is 8.75E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577518612400254) differ by less than 4E-4 (1.1794859255953583e-07) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.629870e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.707913e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598850036412124] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3845s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0087s for 8192 events => throughput is 9.45E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.598860065419856) and cpp (44.598850036412124) differ by less than 4E-4 (2.2487139172966408e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577518612400254] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4049s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3058s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0991s for 90112 events => throughput is 9.09E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577518612400254) differ by less than 4E-4 (1.1794859255953583e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.083136e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.144924e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598854350242270] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4052s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3927s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.598860065419856) and cpp (44.598854350242270) differ by less than 4E-4 (1.2814627048385319e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577522751628507] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4528s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1375s for 90112 events => throughput is 6.55E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577522751628507) differ by less than 4E-4 (2.5093990219104967e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.787403e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.856099e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598870301426373] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.8121s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8116s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.598860065419856) and cuda (44.598870301426373) differ by less than 4E-4 (2.2951273881410827e-07) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577527268256027] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.7384s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7321s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cuda (44.577527268256027) differ by less than 4E-4 (7.622674558227516e-08) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.171187e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.169864e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.060503e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.394999e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.009567e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.550837e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.587863e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 
65536 8 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.478184e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 21fd4d6bec..9a29cd921e 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,8 +1,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-06-03_02:39:29 +DATE: 2024-06-03_23:47:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.8339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7896s - [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7885s + [COUNTERS] Fortran MEs ( 1 ) : 0.0442s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3787s - [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4210s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3766s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 
1.8213s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3354s - [COUNTERS] Fortran MEs ( 1 ) : 0.4859s for 90112 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8161s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3307s + [COUNTERS] Fortran MEs ( 1 ) : 0.4855s for 90112 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,12 +131,473 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 171.8 [171.81273490068889] fbridge_mode=1 - [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.7067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0403s for 8192 events => throughput is 2.03E+05 events/s + [XSECTION] Cross section = 44.6 [44.598861353577519] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4208s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0465s for 8192 events => throughput is 1.76E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (44.598860065419856) and cpp (171.81273490068889) differ by more than 2E-4 (2.8524019369254128) +OK! xsec from fortran (44.598860065419856) and cpp (44.598861353577519) differ by less than 2E-4 (2.888319694527297e-08) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577525144126803] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.8488s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3374s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5114s for 90112 events => throughput is 1.76E+05 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577525144126803) differ by less than 2E-4 (2.8576516486467085e-08) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.809323e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.808510e+05 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598861353577519] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 8192 events => throughput is 2.98E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598861353577519) differ by less than 2E-4 (2.888319694527297e-08) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577525144126810] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.6150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3231s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2919s for 90112 events => throughput is 3.09E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577525144126810) differ by less than 2E-4 (2.857651670851169e-08) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.270237e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.295470e+05 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598861344883289] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3911s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.17E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598861344883289) differ by less than 2E-4 (2.868825421664667e-08) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577525178109212] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4812s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3060s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1752s for 90112 events => throughput is 5.14E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577525178109212) differ by less than 2E-4 (2.9338838025694258e-08) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.216654e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.048168e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598861344883289] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3902s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.64E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598861344883289) differ by less than 2E-4 (2.868825421664667e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577525178109212] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.4663s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3049s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1614s for 90112 events => throughput is 5.58E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577525178109212) differ by less than 2E-4 (2.9338838025694258e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.619505e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.636838e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598861344883289] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4013s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 8192 events => throughput is 3.57E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cpp (44.598861344883289) differ by less than 2E-4 (2.868825421664667e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577525178109212] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.5699s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2533s for 90112 events => throughput is 3.56E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cpp (44.577525178109212) differ by less than 2E-4 (2.9338838025694258e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.647783e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.630056e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.6 [44.598860056955807] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.8132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8126s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (44.598860065419856) and cuda (44.598860056955807) differ by less than 2E-4 (1.8978174587402918e-10) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x10_cudacpp > /tmp/avalassi/output_susyggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.58 [44.577523872560512] fbridge_mode=1 + [UNWEIGHT] Wrote 1743 events (found 1748 events) + [COUNTERS] PROGRAM TOTAL : 1.7355s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.577523870256456) and cuda (44.577523872560512) differ by less than 2E-4 (5.168643291142416e-11) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.178260e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.610652e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.335730e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.073459e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.323128e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.150707e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.332625e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 
65536 8 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.020051e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** + +TEST COMPLETED From f7b9e0438142db9d4862aad5551e5cc6a51ed971 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 12:17:10 +0200 Subject: [PATCH 25/33] [tmad] regenerate all processes after merging upstream/master (new CI #794 and valgrind fixes #869): no change in the code --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 18 +++--- .../CODEGEN_cudacpp_ee_mumu_log.txt | 12 ++-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++--- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++-- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 24 ++++---- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 20 +++---- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 12 ++-- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 22 +++---- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 ++--- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 +++---- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 16 ++--- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 24 ++++---- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 14 ++--- .../CODEGEN_mad_heft_gg_bb_log.txt | 16 ++--- .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 15 +++-- .../CODEGEN_mad_pp_tt012j_log.txt | 60 +++++++++---------- .../CODEGEN_mad_smeft_gg_tttt_log.txt | 20 +++---- .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 16 ++--- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 18 +++--- .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 14 ++--- .../CODEGEN_mad_susy_gg_tt_log.txt | 20 +++---- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 15 +++-- 22 files changed, 215 insertions(+), 207 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 4ec188cadf..28e184bf78 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005849361419677734  +DEBUG: model prefixing takes 0.005658626556396484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.105 s +Wrote files for 8 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.211 s +ALOHA: aloha creates 3 routines in 0.205 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.269 s +ALOHA: aloha creates 7 routines in 0.255 s FFV1 FFV1 FFV2 @@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.156s -user 0m1.872s -sys 0m0.271s +real 0m2.059s +user 0m1.790s +sys 0m0.256s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index c64f078d3d..0fe131292c 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005849123001098633  +DEBUG: model prefixing takes 0.005565166473388672  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT @@ -184,7 +184,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.287 s +ALOHA: aloha creates 4 routines in 0.267 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.786s -user 0m0.646s -sys 0m0.059s +real 0m0.925s +user 0m0.656s +sys 0m0.078s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index cfb6edb459..31b0aa1105 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005819082260131836  +DEBUG: model prefixing takes 0.005632162094116211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,16 +194,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.108 s +Wrote files for 10 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.145 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.140 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -241,9 +241,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.985s -user 0m1.713s -sys 0m0.272s +real 0m1.885s +user 0m1.625s +sys 0m0.252s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index f936b93678..09c303ae9f 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058116912841796875  +DEBUG: model prefixing takes 0.005680561065673828  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -183,7 +183,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.142 s VVV1 FFV1 FFV1 @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.570s -user 0m0.504s -sys 0m0.055s +real 0m0.528s +user 0m0.469s +sys 0m0.053s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 356f172947..2c4436e94e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005854129791259766  +DEBUG: model prefixing takes 0.0052280426025390625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -188,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -205,7 +205,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -220,15 +220,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s -Wrote files for 46 helas calls in 0.262 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.352 s +ALOHA: aloha creates 5 routines in 0.326 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.336 s +ALOHA: aloha creates 10 routines in 0.315 s VVV1 VVV1 FFV1 @@ -287,9 +287,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.781s -user 0m2.445s -sys 0m0.324s +real 0m2.695s +user 0m2.298s +sys 0m0.313s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 89d5e42a2e..d9854701b1 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005921602249145508  +DEBUG: model prefixing takes 0.005344390869140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.158 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s +Wrote files for 36 helas calls in 0.149 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.347 s +ALOHA: aloha creates 5 routines in 0.323 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.332 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -256,9 +256,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.587s -user 0m2.277s -sys 0m0.294s +real 0m2.576s +user 0m2.182s +sys 0m0.268s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index f7ba14f214..e85b70fcb8 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058133602142333984  +DEBUG: model prefixing takes 0.005228757858276367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.346 s +ALOHA: aloha creates 5 routines in 0.326 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.827s -user 0m0.769s -sys 0m0.053s +real 0m0.782s +user 0m0.718s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 849d5b6525..2262ebb5b7 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005772829055786133  +DEBUG: model prefixing takes 0.0058100223541259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.169 s +1 processes with 123 diagrams generated in 0.166 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.456 s -Wrote files for 222 helas calls in 0.738 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.434 s +Wrote files for 222 helas calls in 0.688 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.352 s +ALOHA: aloha creates 5 routines in 0.328 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.342 s +ALOHA: aloha creates 10 routines in 0.311 s VVV1 VVV1 FFV1 @@ -259,10 +259,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m5.259s -user 0m3.713s -sys 0m0.289s -Code generation completed in 5 seconds +real 0m4.141s +user 0m3.506s +sys 0m0.286s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 903ee6a21e..17ef6a38d7 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005794525146484375  +DEBUG: model prefixing takes 0.005803346633911133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.170 s +1 processes with 123 diagrams generated in 0.155 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.455 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.530s -user 0m1.456s -sys 0m0.061s +real 0m1.427s +user 0m1.363s +sys 0m0.053s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 476a42fed0..788b228ebb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005855083465576172  +DEBUG: model prefixing takes 0.005652427673339844  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.990 s +1 processes with 1240 diagrams generated in 1.860 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.986 s -Wrote files for 2281 helas calls in 19.681 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.627 s +Wrote files for 2281 helas calls in 18.541 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.338 s +ALOHA: aloha creates 5 routines in 0.321 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -211,7 +211,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.332 s +ALOHA: aloha creates 10 routines in 0.317 s VVV1 VVV1 FFV1 @@ -261,10 +261,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m34.814s -user 0m34.082s -sys 0m0.442s -Code generation completed in 35 seconds +real 0m32.695s +user 0m32.098s +sys 0m0.486s +Code generation completed in 33 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index ee8521e020..f379a1b9f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058100223541259766  +DEBUG: model prefixing takes 0.005811452865600586  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.992 s +1 processes with 1240 diagrams generated in 1.876 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 7.031 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.539 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.372 s +ALOHA: aloha creates 5 routines in 0.348 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m13.928s -user 0m13.725s -sys 0m0.126s -Code generation completed in 14 seconds +real 0m12.969s +user 0m12.807s +sys 0m0.111s +Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 44f0debd5b..0291728dfd 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005815982818603516  +DEBUG: model prefixing takes 0.005555152893066406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.083 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -201,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -218,7 +218,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -233,17 +233,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s -Wrote files for 32 helas calls in 0.232 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Wrote files for 32 helas calls in 0.220 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.156 s +ALOHA: aloha creates 2 routines in 0.152 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.143 s +ALOHA: aloha creates 4 routines in 0.132 s FFV1 FFV1 FFV1 @@ -298,10 +298,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m3.796s -user 0m2.043s -sys 0m0.293s -Code generation completed in 4 seconds +real 0m2.226s +user 0m1.914s +sys 0m0.300s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index d9ee132bf2..c98a663ee1 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005840778350830078  +DEBUG: model prefixing takes 0.005281686782836914  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.083 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -210,12 +210,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.153 s +ALOHA: aloha creates 2 routines in 0.150 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.702s -user 0m0.619s -sys 0m0.058s +real 0m0.654s +user 0m0.598s +sys 0m0.048s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index b2a820e487..5855fc03a9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -150,7 +150,7 @@ INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Creating files in directory P1_gg_bbx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -165,21 +165,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.111 s +Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s +Wrote files for 12 helas calls in 0.104 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.279 s +ALOHA: aloha creates 4 routines in 0.264 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.262 s +ALOHA: aloha creates 8 routines in 0.252 s VVS3 VVV1 FFV1 @@ -219,9 +219,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.250s -user 0m1.965s -sys 0m0.278s +real 0m2.127s +user 0m1.875s +sys 0m0.254s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index dce6286d40..634234e668 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -62,6 +62,11 @@ set auto_convert_model T save options auto_convert_model save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: load particles +INFO: load vertices +WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +DEBUG: model prefixing takes 0.005600929260253906  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -157,7 +162,7 @@ ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.279 s +ALOHA: aloha creates 4 routines in 0.262 s VVS3 VVV1 FFV1 @@ -174,7 +179,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m2.095s -user 0m0.614s -sys 0m0.059s -Code generation completed in 2 seconds +real 0m0.659s +user 0m0.602s +sys 0m0.049s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 89b4bd51fa..594ff6aa9f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00579380989074707  +DEBUG: model prefixing takes 0.005584716796875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.031 s +5 processes with 7 diagrams generated in 0.030 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.144 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. 
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.940 s +65 processes with 1119 diagrams generated in 1.809 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -500,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -517,7 +517,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -534,7 +534,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -551,7 +551,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -568,7 +568,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -585,7 +585,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -602,7 +602,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -619,7 +619,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -636,7 +636,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -653,7 +653,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -670,7 +670,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -687,7 +687,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -704,7 +704,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -721,7 +721,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -738,7 +738,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -755,7 +755,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -772,7 +772,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -789,7 +789,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.383 s -Wrote files for 810 helas calls in 4.486 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.282 s +Wrote files for 810 helas calls in 3.249 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.361 s +ALOHA: aloha creates 5 routines in 0.338 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.329 s +ALOHA: aloha creates 10 routines in 0.321 s VVV1 VVV1 FFV1 @@ -1032,10 +1032,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m13.329s -user 0m10.572s -sys 0m0.946s -Code generation completed in 13 seconds +real 0m10.903s +user 0m9.970s +sys 0m0.885s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 577a53a429..06ef0911b1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.14798188209533691  +DEBUG: model prefixing takes 0.1368861198425293  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.952 s +1 processes with 72 diagrams generated in 3.790 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 INFO: Creating files in directory P1_gg_ttxttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -130,15 +130,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -Generated helas calls for 1 subprocesses (72 diagrams) in 0.200 s -Wrote files for 119 helas calls in 0.449 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.186 s +Wrote files for 119 helas calls in 0.413 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.313 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -146,7 +146,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.351 s +ALOHA: aloha creates 10 routines in 0.327 s VVV5 VVV5 FFV1 @@ -193,9 +193,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m8.031s -user 0m7.273s -sys 0m0.312s +real 0m7.219s +user 0m6.902s +sys 0m0.297s Code generation completed in 8 seconds ************************************************************ * * diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 159f44d59b..fe99eba416 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.14763712882995605  +DEBUG: model prefixing takes 0.13751649856567383  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.951 s +1 processes with 72 diagrams generated in 3.708 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.198 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.189 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -123,7 +123,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.336 s +ALOHA: aloha creates 5 routines in 0.322 s VVV5 VVV5 FFV1 @@ -143,7 +143,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.433s -user 0m5.333s -sys 0m0.069s -Code generation completed in 6 seconds +real 0m5.106s +user 0m5.014s +sys 0m0.067s +Code generation completed in 5 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 3d8fbc32ca..664aaed428 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.131 s +1 processes with 6 diagrams generated in 0.124 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,19 +592,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s -Wrote files for 16 helas calls in 0.117 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Wrote files for 16 helas calls in 0.112 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.198 s +ALOHA: aloha creates 3 routines in 0.183 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 
routines in 0.192 s +ALOHA: aloha creates 6 routines in 0.183 s VVV1 VSS1 VSS1 @@ -647,9 +647,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.497s -user 0m2.824s -sys 0m0.326s +real 0m2.986s +user 0m2.676s +sys 0m0.299s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 829b9ab7c7..bc0d525a89 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.131 s +1 processes with 6 diagrams generated in 0.123 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Load PLUGIN.CUDACPP_OUTPUT @@ -577,13 +577,13 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.203 s +ALOHA: aloha creates 3 routines in 0.181 s VVV1 VSS1 VSS1 @@ -599,7 +599,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.431s -user 0m1.338s -sys 0m0.069s -Code generation completed in 1 seconds +real 0m1.313s +user 0m1.239s +sys 0m0.063s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 6a8c405b07..9969a68175 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.127 s +1 processes with 3 diagrams generated in 0.126 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,17 +592,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.110 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Wrote files for 10 helas calls in 0.107 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.135 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.141 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -640,10 
+640,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.015s -user 0m2.709s -sys 0m0.297s -Code generation completed in 3 seconds +real 0m4.141s +user 0m2.577s +sys 0m0.292s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 86c6a6a716..57c49445c0 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -59,6 +59,9 @@ set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 +INFO: load particles +INFO: load vertices +DEBUG: model prefixing takes 0.9004595279693604  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -554,7 +557,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.126 s +1 processes with 3 diagrams generated in 0.109 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -582,7 +585,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.136 s VVV1 FFV1 FFV1 @@ -597,7 +600,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.401s -user 0m1.289s -sys 0m0.064s -Code generation completed in 2 seconds +real 0m2.351s +user 0m2.265s +sys 0m0.070s +Code generation completed in 3 seconds From 1626858877cc77eda2b0c26bbe99b628d6ab926c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 15:16:34 +0200 Subject: [PATCH 26/33] [tmad] in gg_tt.mad aloha_functions.f, improve the comment about #855 (prepare to move upstream to mg5amcnlo gpucpp) --- epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) From 720ae02e537b953184dd27ea12812f3ba32b3598 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 15:21:22 +0200 Subject: [PATCH 27/33] [tmad] update mg5amcnlo to f274cab55, adding volatile to prevent #855 crashes in rotxxx (move this upstream as suggested by Olivier) --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 0b8678984c..f274cab55d 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 0b8678984c21f49af9690594344ee53b6abc38e3 +Subproject commit f274cab55d5d983c5612ca7ab3417ee796aa1a8c From 3f6f6474b955877d41235d9208d3a816a049e086 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 15:23:28 +0200 Subject: [PATCH 28/33] [tmad] in CODEGEN patch.common, remove 'volatile' patch in rotxxx (this is now upstream in mg5amcnlo as suggested by Olivier) NB: patches should now be generated without including aloha_functions.f: ./CODEGEN/generateAndCompare.sh gg_tt --mad --nopatch git diff --no-ext-diff -R gg_tt.mad/Source/makefile gg_tt.mad/Source/dsample.f gg_tt.mad/Source/genps.inc gg_tt.mad/SubProcesses/makefile > CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common git diff --no-ext-diff -R gg_tt.mad/bin/internal/banner.py gg_tt.mad/bin/internal/gen_ximprove.py gg_tt.mad/bin/internal/madevent_interface.py >> CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common git diff --no-ext-diff -R gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f > CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 git checkout gg_tt.mad --- .../MG5aMC_patches/PROD/patch.common | 13 
------------- 1 file changed, 13 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common index a144380912..3cfcc909d9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common @@ -1,16 +1,3 @@ -diff --git b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f -index 657387a58..d0ec1dbde 100644 ---- b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f -+++ a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_functions.f -@@ -1201,7 +1201,7 @@ c real prot(0:3) : four-momentum p in the rotated frame - c - implicit none - double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 -- -+ volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) - double precision rZero, rOne - parameter( rZero = 0.0d0, rOne = 1.0d0 ) - diff --git b/epochX/cudacpp/gg_tt.mad/Source/genps.inc a/epochX/cudacpp/gg_tt.mad/Source/genps.inc index a59181c70..af7e0efbc 100644 --- b/epochX/cudacpp/gg_tt.mad/Source/genps.inc From c0ab3f9a09b4595b0328f42cbba604a132cc2a2e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 15:24:47 +0200 Subject: [PATCH 29/33] [tmad] regenerate gg_tt.mad, check that all is ok after moving the volatile patch from patch.common to mg5amcnlo upstream --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 31b0aa1105..5d7d2be181 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the 
Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005632162094116211  +DEBUG: model prefixing takes 0.005706310272216797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,16 +194,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.142 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -224,7 +224,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -241,9 +240,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.885s -user 0m1.625s -sys 0m0.252s +real 0m1.988s +user 0m1.706s +sys 0m0.263s Code generation completed in 2 seconds ************************************************************ * * From 39647605a7b6a2a70cad2f0e1edaeb71cc053f2c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 16:00:41 +0200 Subject: [PATCH 30/33] [tmad] regenerate all processes - no change in the code, except for the comment about volatile --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 15 +++-- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 ++-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++--- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++-- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 21 ++++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 21 ++++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 14 ++--- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 21 ++++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 ++--- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 23 ++++---- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++--- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 23 ++++---- .../Source/DHELAS/aloha_functions.f | 2 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 16 ++--- .../CODEGEN_mad_heft_gg_bb_log.txt | 17 +++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 15 ++--- .../CODEGEN_mad_pp_tt012j_log.txt | 59 
+++++++++---------- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_mad_smeft_gg_tttt_log.txt | 23 ++++---- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 14 ++--- .../CODEGEN_mad_susy_gg_t1t1_log.txt | 17 +++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 14 ++--- .../CODEGEN_mad_susy_gg_tt_log.txt | 19 +++--- .../Source/DHELAS/aloha_functions.f | 2 +- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 15 ++--- 33 files changed, 208 insertions(+), 227 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 28e184bf78..15d7641fe1 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005658626556396484  +DEBUG: model prefixing takes 0.005683183670043945  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.100 s +Wrote files for 8 helas calls in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.205 s +ALOHA: aloha creates 3 routines in 0.197 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.255 s +ALOHA: aloha creates 7 routines in 0.258 s FFV1 FFV1 FFV2 @@ -231,7 +231,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -252,9 +251,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.059s +real 0m2.034s user 0m1.790s -sys 0m0.256s +sys 0m0.231s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 0fe131292c..b4fd2fd3b3 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005565166473388672  +DEBUG: model prefixing takes 0.005368709564208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -184,7 +184,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.265 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.925s -user 0m0.656s -sys 0m0.078s +real 0m0.653s +user 0m0.596s +sys 0m0.048s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 5d7d2be181..5d37596ba7 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005706310272216797  +DEBUG: model prefixing takes 0.00545048713684082  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,16 +194,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.142 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.131 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -240,9 +240,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.988s -user 0m1.706s -sys 0m0.263s +real 0m1.891s +user 0m1.634s +sys 0m0.251s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 09c303ae9f..2b17f9f229 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005680561065673828  +DEBUG: model prefixing takes 0.00540924072265625  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -183,7 +183,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.142 s +ALOHA: aloha creates 2 routines in 0.144 s VVV1 FFV1 FFV1 @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.528s -user 0m0.469s -sys 0m0.053s -Code generation completed in 0 seconds +real 0m0.547s +user 0m0.473s +sys 0m0.059s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 2c4436e94e..a479d2daa3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0052280426025390625  +DEBUG: model prefixing takes 0.005651235580444336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.020 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -188,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -205,7 +205,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -221,14 +221,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s -Wrote files for 46 helas calls in 0.243 s +Wrote files for 46 helas calls in 0.244 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.324 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -262,7 +262,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -287,9 +286,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.695s -user 0m2.298s -sys 0m0.313s +real 0m2.619s +user 0m2.309s +sys 0m0.296s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index d9854701b1..c843ce6664 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005344390869140625  +DEBUG: model prefixing takes 0.0052797794342041016  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.149 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.150 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.327 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 
routines in 0.312 s VVV1 VVV1 FFV1 @@ -235,7 +235,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -256,10 +255,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.576s -user 0m2.182s -sys 0m0.268s -Code generation completed in 3 seconds +real 0m2.423s +user 0m2.148s +sys 0m0.276s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index e85b70fcb8..6c10c486e2 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005228757858276367  +DEBUG: model prefixing takes 0.005514383316040039  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.325 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.782s -user 0m0.718s -sys 0m0.054s +real 0m0.778s +user 0m0.707s +sys 0m0.061s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2262ebb5b7..d3e677cd6e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058100223541259766  +DEBUG: model prefixing takes 0.005806922912597656  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.170 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.434 s -Wrote files for 222 helas calls in 0.688 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s +Wrote files for 222 helas calls in 0.687 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.326 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.311 s +ALOHA: aloha creates 10 routines in 0.312 s VVV1 VVV1 FFV1 @@ -238,7 +238,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -259,9 +258,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m4.141s -user 0m3.506s -sys 0m0.286s +real 0m3.794s +user 0m3.521s +sys 0m0.268s Code generation completed in 4 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! 
prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 17ef6a38d7..c72f0d5ba4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005803346633911133  +DEBUG: model prefixing takes 0.005587577819824219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.155 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.318 s +ALOHA: aloha creates 5 routines in 0.320 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.427s -user 0m1.363s -sys 0m0.053s -Code generation completed in 1 seconds +real 0m1.431s +user 0m1.367s +sys 0m0.056s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 788b228ebb..134d0547e0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005652427673339844  +DEBUG: model prefixing takes 0.0053365230560302734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.860 s +1 processes with 1240 diagrams generated in 1.873 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.627 s -Wrote files for 2281 helas calls in 18.541 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s +Wrote files for 2281 helas calls in 18.429 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.321 s +ALOHA: aloha creates 5 routines in 0.320 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -211,7 +211,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 routines in 0.315 s VVV1 VVV1 FFV1 @@ -240,7 +240,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -261,10 +260,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m32.695s -user 0m32.098s -sys 0m0.486s -Code generation completed in 33 seconds +real 0m32.528s +user 0m31.969s +sys 0m0.456s +Code generation completed in 32 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index f379a1b9f3..0e29de8efe 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005811452865600586  +DEBUG: model prefixing takes 0.005451679229736328  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.876 s +1 processes with 1240 diagrams generated in 1.880 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.539 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.514 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.343 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m12.969s -user 0m12.807s -sys 0m0.111s +real 0m12.954s +user 0m12.797s +sys 0m0.105s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 0291728dfd..7dfcadb211 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005555152893066406  +DEBUG: model prefixing takes 0.005356311798095703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -201,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -218,7 +218,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -234,16 +234,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.220 s +Wrote files for 32 helas calls in 0.218 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.131 s FFV1 FFV1 FFV1 @@ -265,7 +265,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -298,10 +297,10 @@ Type "launch" to generate events from this process, or 
see Run "open index.html" to see more information about this process. quit -real 0m2.226s -user 0m1.914s -sys 0m0.300s -Code generation completed in 3 seconds +real 0m2.201s +user 0m1.903s +sys 0m0.291s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index c98a663ee1..6496c26a2e 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005281686782836914  +DEBUG: model prefixing takes 0.005724191665649414  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -210,12 +210,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.150 s +ALOHA: aloha creates 2 routines in 0.143 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
quit -real 0m0.654s -user 0m0.598s -sys 0m0.048s -Code generation completed in 1 seconds +real 0m0.644s +user 0m0.581s +sys 0m0.057s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index 5855fc03a9..8e47e032d0 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -150,7 +150,7 @@ INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Creating files in directory P1_gg_bbx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -165,21 +165,21 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s -Wrote files for 12 helas calls in 0.104 s +Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s +Wrote files for 12 helas calls in 0.107 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.264 s +ALOHA: aloha creates 4 routines in 0.263 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.252 s +ALOHA: aloha creates 8 routines in 0.249 s VVS3 VVV1 FFV1 @@ -202,7 +202,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -219,9 +218,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.127s -user 0m1.875s -sys 0m0.254s +real 0m2.124s +user 0m1.871s +sys 0m0.250s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 634234e668..fd17ced3f2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -62,11 +62,6 @@ set auto_convert_model T save options auto_convert_model save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.005600929260253906  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -162,7 +157,7 @@ ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.262 s +ALOHA: aloha creates 4 routines in 0.257 s VVS3 VVV1 FFV1 @@ -179,7 +174,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.659s -user 0m0.602s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.733s +user 0m0.574s +sys 0m0.047s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 594ff6aa9f..36cdf152ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005584716796875  +DEBUG: model prefixing takes 0.005532741546630859  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.809 s +65 processes with 1119 diagrams generated in 1.814 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -500,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -517,7 +517,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -534,7 +534,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -551,7 +551,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -568,7 +568,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -585,7 +585,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -602,7 +602,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -619,7 +619,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -636,7 +636,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -653,7 +653,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -670,7 +670,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -687,7 +687,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -704,7 +704,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -721,7 +721,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -738,7 +738,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -755,7 +755,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -772,7 +772,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -789,7 +789,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.282 s -Wrote files for 810 helas calls in 3.249 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.274 s +Wrote files for 810 helas calls in 3.287 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.338 s +ALOHA: aloha creates 5 routines in 0.341 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.312 s VVV1 VVV1 FFV1 @@ -849,7 +849,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -1032,10 +1031,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m10.903s -user 0m9.970s -sys 0m0.885s -Code generation completed in 10 seconds +real 0m10.934s +user 0m9.967s +sys 0m0.904s +Code generation completed in 11 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 06ef0911b1..b09bcf57e4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.1368861198425293  +DEBUG: model prefixing takes 
0.13876676559448242  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.790 s +1 processes with 72 diagrams generated in 3.667 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 INFO: Creating files in directory P1_gg_ttxttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  
[export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -130,15 +130,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -Generated helas calls for 1 subprocesses (72 diagrams) in 0.186 s -Wrote files for 119 helas calls in 0.413 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.185 s +Wrote files for 119 helas calls in 0.412 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.313 s +ALOHA: aloha creates 5 routines in 0.319 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -146,7 +146,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.330 s VVV5 VVV5 FFV1 @@ -172,7 +172,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -193,10 +192,10 @@ 
Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m7.219s -user 0m6.902s -sys 0m0.297s -Code generation completed in 8 seconds +real 0m7.121s +user 0m6.793s +sys 0m0.302s +Code generation completed in 7 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index fe99eba416..971140c7c5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines 
FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.13751649856567383  +DEBUG: model prefixing takes 0.13476824760437012  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.708 s +1 processes with 72 diagrams generated in 3.685 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.189 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.192 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -123,7 +123,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.333 s VVV5 VVV5 FFV1 @@ -143,7 +143,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.106s -user 0m5.014s -sys 0m0.067s +real 0m5.115s +user 0m5.005s +sys 0m0.069s Code generation completed in 5 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 664aaed428..461742ce96 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.124 s +1 processes with 6 diagrams generated in 0.122 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -593,18 +593,18 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s -Wrote files for 16 helas calls in 0.112 s +Wrote files for 16 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.183 s +ALOHA: aloha creates 3 routines in 0.181 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.183 s +ALOHA: aloha creates 6 routines in 0.178 s VVV1 VSS1 VSS1 @@ -626,7 +626,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: 
Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -647,9 +646,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.986s -user 0m2.676s -sys 0m0.299s +real 0m2.946s +user 0m2.636s +sys 0m0.305s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index bc0d525a89..51006a93e0 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.123 s +1 processes with 6 diagrams generated in 0.121 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Load PLUGIN.CUDACPP_OUTPUT @@ -577,13 +577,13 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. -Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.181 s +ALOHA: aloha creates 3 routines in 0.183 s VVV1 VSS1 VSS1 @@ -599,7 +599,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
quit -real 0m1.313s -user 0m1.239s -sys 0m0.063s -Code generation completed in 2 seconds +real 0m1.307s +user 0m1.235s +sys 0m0.065s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 9969a68175..d7a6d462a0 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.126 s +1 processes with 3 diagrams generated in 0.117 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1151]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,17 +592,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.107 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.108 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.135 s +ALOHA: aloha creates 2 routines in 0.137 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.131 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -623,7 +623,6 @@ INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file Source/DHELAS/aloha_functions.f patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile @@ -640,9 +639,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m4.141s -user 0m2.577s -sys 0m0.292s +real 0m4.205s +user 0m2.543s +sys 0m0.309s Code generation completed in 4 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f index d0ec1dbde9..975725737f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/DHELAS/aloha_functions.f @@ -1201,7 +1201,7 @@ subroutine rotxxx(p,q , prot) c implicit none double precision p(0:3),q(0:3),prot(0:3),qt2,qt,psgn,qq,p1 - volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE #855) + volatile qt, p1, qq ! prevent optimizations with -O3 (workaround for SIGFPE crashes in rotxxx: madgraph5/madgraph4gpu#855) double precision rZero, rOne parameter( rZero = 0.0d0, rOne = 1.0d0 ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 57c49445c0..733f73bb02 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -59,9 +59,6 @@ set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.9004595279693604  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -557,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.109 s +1 processes with 3 diagrams generated in 0.128 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -585,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.137 s VVV1 FFV1 FFV1 @@ -600,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m2.351s -user 0m2.265s -sys 0m0.070s -Code generation completed in 3 seconds +real 0m1.281s +user 0m1.210s +sys 0m0.060s +Code generation completed in 2 seconds From 97af46fd25a4d9c5f8aabe6fabf2942963231177 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 16:54:40 +0200 Subject: [PATCH 31/33] [tmad] in .github/workflows/testsuite_oneprocess.sh, use iconfig=104 in gg_ttgg tmad tests (LHE color mismatch #856?) --- .github/workflows/testsuite_oneprocess.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index 2822966d14..fe06327c6f 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -272,6 +272,7 @@ function getinputfile() nevt=$(getnevt) tmp=$tmpdir/input_${proc%.mad}_${backend} iconfig=1 + if [ "${proc%.mad}" == "gg_ttgg" ]; then iconfig=104; fi # test iconfig=104 on gg_ttgg (LHE color mismatch #856?) cat << EOF >> ${tmp} ${nevt} 1 1 ! 
Number of events and max and min iterations 0.000001 ! Accuracy (ignored because max iterations = min iterations) From f535c494d3773abd4c745cfd82bd35daf7cd8a1e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 18:17:45 +0200 Subject: [PATCH 32/33] [tmad] in .github/workflows/testsuite_oneprocess.sh, update the list of known issues (e.g. remove rotxxx crashes): enable bypasses, tests should succeed --- .github/workflows/testsuite_oneprocess.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index fe06327c6f..17066ef943 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -534,12 +534,12 @@ function bypassIssue(){ if [ $BYPASS_KNOWN_ISSUES -eq 1 ] && [ $status -ne 0 ]; then # Known issues in tmad_test if [ "$stage" == "tmad_test" ]; then - # No cross section for susy_gg_t1t1 (#826) - ###if [ "${proc%.mad}" == "susy_gg_t1t1" ]; then bypassIssue "No cross section in ${proc%.mad} for FPTYPE=d,f,m (#826)"; fi - # SIGFPE crashes in rotxxx (#855) - ###if [ "${proc%.mad}" == "gq_ttq" ]; then bypassIssue "SIGFPE crash in rotxxx in ${proc%.mad} for FPTYPE=d,f,m (#855)"; fi - ###if [ "${proc%.mad}" == "pp_tt012j" ]; then bypassIssue "SIGFPE crash in rotxxx in ${proc%.mad} for FPTYPE=d,f,m (#855)"; fi - ###if [ "${proc%.mad}" == "nobm_pp_ttW" ]; then bypassIssue "#SIGFPE crash in rotxxx in ${proc%.mad} for FPTYPE=d,f,m (#855)"; fi + # No cross section in susy_gg_t1t1 (#826) + if [ "${proc%.mad}" == "susy_gg_t1t1" ]; then bypassIssue "No cross section in ${proc%.mad} for FPTYPE=d,f,m (#826)"; fi + # LHE color mismatch in gg_ttgg for iconfig=104 (#856) + if [ "${proc%.mad}" == "gg_ttgg" ]; then bypassIssue "LHE color mismatch for iconfig=104 in ${proc%.mad} for FPTYPE=d,f,m (#856)"; fi + # Cross section mismatch in pp_tt012j for P2_gu_ttxgu (#872) + if [ "${proc%.mad}" == "pp_tt012j" ]; then 
bypassIssue "Cross section mismatch for P2_gu_ttxgu in ${proc%.mad} for FPTYPE=d,f,m (#872)"; fi # Final printout if [ $status -ne 0 ]; then echo "[testsuite_oneprocess.sh] $stage ($proc) FPTYPE=${FPTYPE}: issue will not be bypassed, test has FAILED"; fi fi From 30cda683cd13c4835a6b6f5abe772b93eb466f70 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 27 Jun 2024 18:29:01 +0200 Subject: [PATCH 33/33] [tmad] ** COMPLETE TMAD (fixes for rotxxx crashes) ** disable bypassing of tmad known issues, the CI will fail, signaling these issues are pending --- .github/workflows/testsuite_oneprocess.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index 17066ef943..a0051dda9e 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -535,11 +535,11 @@ if [ $BYPASS_KNOWN_ISSUES -eq 1 ] && [ $status -ne 0 ]; then # Known issues in tmad_test if [ "$stage" == "tmad_test" ]; then # No cross section in susy_gg_t1t1 (#826) - if [ "${proc%.mad}" == "susy_gg_t1t1" ]; then bypassIssue "No cross section in ${proc%.mad} for FPTYPE=d,f,m (#826)"; fi + ###if [ "${proc%.mad}" == "susy_gg_t1t1" ]; then bypassIssue "No cross section in ${proc%.mad} for FPTYPE=d,f,m (#826)"; fi # LHE color mismatch in gg_ttgg for iconfig=104 (#856) - if [ "${proc%.mad}" == "gg_ttgg" ]; then bypassIssue "LHE color mismatch for iconfig=104 in ${proc%.mad} for FPTYPE=d,f,m (#856)"; fi + ###if [ "${proc%.mad}" == "gg_ttgg" ]; then bypassIssue "LHE color mismatch for iconfig=104 in ${proc%.mad} for FPTYPE=d,f,m (#856)"; fi # Cross section mismatch in pp_tt012j for P2_gu_ttxgu (#872) - if [ "${proc%.mad}" == "pp_tt012j" ]; then bypassIssue "Cross section mismatch for P2_gu_ttxgu in ${proc%.mad} for FPTYPE=d,f,m (#872)"; fi + ###if [ "${proc%.mad}" == "pp_tt012j" ]; then bypassIssue "Cross section mismatch for P2_gu_ttxgu in ${proc%.mad} for FPTYPE=d,f,m 
(#872)"; fi # Final printout if [ $status -ne 0 ]; then echo "[testsuite_oneprocess.sh] $stage ($proc) FPTYPE=${FPTYPE}: issue will not be bypassed, test has FAILED"; fi fi