From 46d6b2bd6372128c8780cb53c1b9a0c02a26100b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 12:32:43 +0200 Subject: [PATCH 1/7] Added new genoa partition. Renamed thin to rome, as the thin partition will be deprecated --- config/surf_snellius.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 72aad234..71c55a3c 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -19,18 +19,32 @@ 'stagedir': f'/scratch-shared/{os.environ.get("USER")}/reframe_output/staging', 'partitions': [ { - 'name': 'thin', + 'name': 'rome', 'scheduler': 'slurm', 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'launcher': 'mpirun', - 'access': ['-p thin', '--export=None'], + 'access': ['-p rome', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'features': [ FEATURES[CPU], ], - 'descr': 'Test CPU partition with native EESSI stack' + 'descr': 'AMD Rome CPU partition with native EESSI stack' }, + { + 'name': 'genoa', + 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'mpirun', + 'access': ['-p genoa', '--export=None'], + 'environs': ['default'], + 'max_jobs': 120, + 'features': [ + FEATURES[CPU], + ], + 'descr': 'AMD Genoa CPU partition with native EESSI stack' + }, + { 'name': 'gpu', 'scheduler': 'slurm', @@ -57,7 +71,7 @@ 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], }, - 'descr': 'Test GPU partition with native EESSI stack' + 'descr': 'Nvidia A100 GPU partition with native EESSI stack' }, ] }, From 194dbfc462d57c791c68616ffa8a39c76f165cda Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 12:40:56 +0200 Subject: [PATCH 2/7] Added comment about autodetect for ReFrame 4.3.3, since a change of the launcher is not needed there anymore. 
It doesn't use a launcher anymore, just executes locally, since it's serial code anyway --- config/surf_snellius.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 71c55a3c..917f8e0f 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -88,7 +88,7 @@ 'general': [ { # For autodetect to work, temporarily change: - # 1. The launchers to srun + # 1. The launchers to srun (or use ReFrame >= 4.3.3) # 2. Add --exclusive to GPU 'access' field above (avoids submission error that no GPUs are requested) 'remote_detect': True, } From 76f6d4daf42f1d831a85a031e68d008f50359ce0 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 13:09:10 +0200 Subject: [PATCH 3/7] Changed comment for CPU autodetect. I'll move this up for other configs as well --- config/surf_snellius.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 917f8e0f..1059b9a5 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -1,3 +1,12 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 _or_ temporarily change the 'launcher' for each partition to srun +# 2. Temporarily change the 'access' field for the GPU partition to +# 'access': ['-p gpu', '--export=None', '--exclusive'], + +# Without this, the autodetect job fails because +# a missing mpirun command (change #1) +# Snellius doesn't allow submission to the GPU partition without requesting at least one GPU (change #2) + import os from eessi.testsuite.common_config import common_logging_config @@ -87,9 +96,8 @@ 'logging': common_logging_config(reframe_prefix), 'general': [ { - # For autodetect to work, temporarily change: - # 1. The launchers to srun (or use ReFrame >= 4.3.3) - # 2. 
Add --exclusive to GPU 'access' field above (avoids submission error that no GPUs are requested) + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ], From 2f411e9c3d3c4bc228f55039eb5685c4a6e1af26 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 13:13:25 +0200 Subject: [PATCH 4/7] Changed comments for CPU autodetection in Vega --- config/izum_vega.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 51300d25..500bff09 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -1,3 +1,12 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 _or_ temporarily change the 'launcher' for each partition to srun +# 2. Temporarily change the 'access' field for the GPU partition to +# 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], + +# Without this, the autodetect job fails because +# a missing mpirun command (change #1) +# Vega doesn't allow submission to the GPU partition without requesting at least one GPU (change #2) + import os from eessi.testsuite.common_config import common_logging_config @@ -37,7 +46,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection + 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p cpu', '--export=None'], 'environs': ['default'], @@ -60,7 +69,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection + 'launcher': 'mpirun', # Use --export=None to avoid that login environment 
is passed down to submitted jobs 'access': ['-p gpu', '--export=None'], 'environs': ['default'], From 44b815b3bb6b4f0e0f642d52215229c83becbdcc Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 13:24:20 +0200 Subject: [PATCH 5/7] Added required changes for CPU autodetection --- config/aws_citc.py | 19 +++++++++---------- config/izum_vega.py | 14 ++++++++++---- config/surf_snellius.py | 14 ++++++++++---- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/config/aws_citc.py b/config/aws_citc.py index 1141291d..7b411fd5 100644 --- a/config/aws_citc.py +++ b/config/aws_citc.py @@ -1,15 +1,14 @@ -# This is an example configuration file +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository -# Note that CPU autodetect currently does not work with this configuration file on AWS. -# This is because there is no system mpirun, and the CPU autodetection doesn't load any modules -# that would make an mpirun command available (as normal multiprocessing tests would). -# In order to do CPU autodetection, you'll need to change the launcer to srun: -# 'launcher = srun' -# You can run the CPU autodetect by listing all tests (reframe -l ...) -# and then, once all CPUs are autodetected, change the launcher back to mpirun for a 'real' run (reframe -r ...) +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job -# Another known issue is that CPU autodetection fails if run from an actual installation of ReFrame. -# It only works if run from a clone of their Github Repo. See https://github.com/reframe-hpc/reframe/issues/2914 +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 import os diff --git a/config/izum_vega.py b/config/izum_vega.py index 500bff09..1fb9ecd5 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -1,11 +1,17 @@ # WARNING: for CPU autodetect to work correctly you need to -# 1. Either use ReFrame >= 4.3.3 _or_ temporarily change the 'launcher' for each partition to srun -# 2. Temporarily change the 'access' field for the GPU partition to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# 3. Temporarily change the 'access' field for the GPU partition to # 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], # Without this, the autodetect job fails because -# a missing mpirun command (change #1) -# Vega doesn't allow submission to the GPU partition without requesting at least one GPU (change #2) +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job +# 3. Vega doesn't allow submission to the GPU partition without requesting at least one GPU + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. https://github.com/reframe-hpc/reframe/issues/2914 import os diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 1059b9a5..d15d2a6d 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -1,11 +1,17 @@ # WARNING: for CPU autodetect to work correctly you need to -# 1. Either use ReFrame >= 4.3.3 _or_ temporarily change the 'launcher' for each partition to srun -# 2. Temporarily change the 'access' field for the GPU partition to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# 3. 
Temporarily change the 'access' field for the GPU partition to # 'access': ['-p gpu', '--export=None', '--exclusive'], # Without this, the autodetect job fails because -# a missing mpirun command (change #1) -# Snellius doesn't allow submission to the GPU partition without requesting at least one GPU (change #2) +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job +# 3. Snellius doesn't allow submission to the GPU partition without requesting at least one GPU + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. https://github.com/reframe-hpc/reframe/issues/2914 import os From 08eff9a0afe14a00c084326de5b4c391eaa2f9d1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 13:26:08 +0200 Subject: [PATCH 6/7] Move the autodetect setting to the top --- config/surf_snellius.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/config/surf_snellius.py b/config/surf_snellius.py index d15d2a6d..966807a8 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -24,6 +24,13 @@ # This is an example configuration file site_configuration = { + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + } + ], 'systems': [ { 'name': 'snellius', @@ -100,11 +107,4 @@ }, ], 'logging': common_logging_config(reframe_prefix), - 'general': [ - { - # Enable automatic detection of CPU architecture for each partition - # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information - 'remote_detect': True, - } - ], } From 3bcee8477a61bbd3de796dbd59a77f6b4d247f40 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Sep 2023 13:31:56 +0200 Subject: [PATCH 7/7] Added comment on CPU autodetect to settings_example. 
Moved all autodetect settings to the end of the config --- config/aws_citc.py | 8 ++------ config/izum_vega.py | 14 +++++++------- config/settings_example.py | 23 +++++++++++++++++++++++ config/surf_snellius.py | 14 +++++++------- 4 files changed, 39 insertions(+), 20 deletions(-) diff --git a/config/aws_citc.py b/config/aws_citc.py index 7b411fd5..6bd44062 100644 --- a/config/aws_citc.py +++ b/config/aws_citc.py @@ -118,6 +118,8 @@ 'logging': common_logging_config(reframe_prefix), 'general': [ { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ], @@ -126,12 +128,6 @@ # Add default things to each partition: partition_defaults = { 'scheduler': 'squeue', - # mpirun causes problems with cpu autodetect, since there is no system mpirun. - # See https://github.com/EESSI/test-suite/pull/53#issuecomment-1590849226 - # and this feature request https://github.com/reframe-hpc/reframe/issues/2926 - # However, using srun requires either using pmix or proper pmi2 integration in the MPI library - # See https://github.com/EESSI/test-suite/pull/53#issuecomment-1598753968 - # Thus, we use mpirun for now, and manually swap to srun if we want to autodetect CPUs... 
'launcher': 'mpirun', 'environs': ['default'], 'features': [ diff --git a/config/izum_vega.py b/config/izum_vega.py index 1fb9ecd5..ca3e2179 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -24,13 +24,6 @@ # This is an example configuration file site_configuration = { - 'general': [ - { - # Enable automatic detection of CPU architecture for each partition - # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information - 'remote_detect': True, - } - ], 'systems': [ { 'name': 'vega', @@ -109,4 +102,11 @@ }, ], 'logging': common_logging_config(reframe_prefix), + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + } + ], } diff --git a/config/settings_example.py b/config/settings_example.py index af910605..219efbcd 100644 --- a/config/settings_example.py +++ b/config/settings_example.py @@ -1,3 +1,19 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# If your system has a GPU partition, it might force jobs to request at least one GPU. If that is the +# case, you also need to temporarily change 'access' field for the GPU partition to include the request +# for one GPU, e.g. 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], + +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 + + """ Example configuration file """ @@ -79,6 +95,13 @@ }, ], 'logging': common_logging_config(), + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + } + ], } # optional logging to syslog diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 966807a8..d15d2a6d 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -24,13 +24,6 @@ # This is an example configuration file site_configuration = { - 'general': [ - { - # Enable automatic detection of CPU architecture for each partition - # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information - 'remote_detect': True, - } - ], 'systems': [ { 'name': 'snellius', @@ -107,4 +100,11 @@ }, ], 'logging': common_logging_config(reframe_prefix), + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + } + ], }