diff --git a/config/aws_citc.py b/config/aws_citc.py index 1141291d..6bd44062 100644 --- a/config/aws_citc.py +++ b/config/aws_citc.py @@ -1,15 +1,14 @@ -# This is an example configuration file +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository -# Note that CPU autodetect currently does not work with this configuration file on AWS. -# This is because there is no system mpirun, and the CPU autodetection doesn't load any modules -# that would make an mpirun command available (as normal multiprocessing tests would). -# In order to do CPU autodetection, you'll need to change the launcer to srun: -# 'launcher = srun' -# You can run the CPU autodetect by listing all tests (reframe -l ...) -# and then, once all CPUs are autodetected, change the launcher back to mpirun for a 'real' run (reframe -r ...) +# Without this, the autodetect job fails because +# 1. The mpirun command is missing +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job -# Another known issue is that CPU autodetection fails if run from an actual installation of ReFrame. -# It only works if run from a clone of their Github Repo. See https://github.com/reframe-hpc/reframe/issues/2914 +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 import os @@ -119,6 +118,8 @@ 'logging': common_logging_config(reframe_prefix), 'general': [ { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ], @@ -127,12 +128,6 @@ # Add default things to each partition: partition_defaults = { 'scheduler': 'squeue', - # mpirun causes problems with cpu autodetect, since there is no system mpirun. - # See https://github.com/EESSI/test-suite/pull/53#issuecomment-1590849226 - # and this feature request https://github.com/reframe-hpc/reframe/issues/2926 - # However, using srun requires either using pmix or proper pmi2 integration in the MPI library - # See https://github.com/EESSI/test-suite/pull/53#issuecomment-1598753968 - # Thus, we use mpirun for now, and manually swap to srun if we want to autodetect CPUs... 'launcher': 'mpirun', 'environs': ['default'], 'features': [ diff --git a/config/izum_vega.py b/config/izum_vega.py index 51300d25..ca3e2179 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -1,3 +1,18 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# 3. Temporarily change the 'access' field for the GPU partition to +# 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], + +# Without this, the autodetect job fails because +# 1. The mpirun command is missing +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job +# 3. Vega doesn't allow submission to the GPU partition without requesting at least one GPU (change #3) + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 + import os from eessi.testsuite.common_config import common_logging_config @@ -9,13 +24,6 @@ # This is an example configuration file site_configuration = { - 'general': [ - { - # Enable automatic detection of CPU architecture for each partition - # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information - 'remote_detect': True, - } - ], 'systems': [ { 'name': 'vega', @@ -37,7 +45,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection + 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p cpu', '--export=None'], 'environs': ['default'], @@ -60,7 +68,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection + 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p gpu', '--export=None'], 'environs': ['default'], @@ -94,4 +102,11 @@ }, ], 'logging': common_logging_config(reframe_prefix), + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + } + ], } diff --git a/config/settings_example.py b/config/settings_example.py index af910605..219efbcd 100644 --- a/config/settings_example.py +++ b/config/settings_example.py @@ -1,3 +1,19 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. 
Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# If your system has a GPU partition, it might force jobs to request at least one GPU. If that is the +# case, you also need to temporarily change the 'access' field for the GPU partition to include the request +# for one GPU, e.g. 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], + +# Without this, the autodetect job fails because +# 1. The mpirun command is missing +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. https://github.com/reframe-hpc/reframe/issues/2914 + + """ Example configuration file """ @@ -79,6 +95,13 @@ }, ], 'logging': common_logging_config(), + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + } + ], } # optional logging to syslog diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 72aad234..d15d2a6d 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -1,3 +1,18 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# 3. Temporarily change the 'access' field for the GPU partition to +# 'access': ['-p gpu', '--export=None', '--exclusive'], + +# Without this, the autodetect job fails because +# 1. The mpirun command is missing +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job +# 3. Snellius doesn't allow submission to the GPU partition without requesting at least one GPU + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 + import os from eessi.testsuite.common_config import common_logging_config @@ -19,18 +34,32 @@ 'stagedir': f'/scratch-shared/{os.environ.get("USER")}/reframe_output/staging', 'partitions': [ { - 'name': 'thin', + 'name': 'rome', + 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'mpirun', + 'access': ['-p rome', '--export=None'], + 'environs': ['default'], + 'max_jobs': 120, + 'features': [ + FEATURES[CPU], + ], + 'descr': 'AMD Rome CPU partition with native EESSI stack' + }, + { + 'name': 'genoa', 'scheduler': 'slurm', 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'launcher': 'mpirun', - 'access': ['-p thin', '--export=None'], + 'access': ['-p genoa', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'features': [ FEATURES[CPU], ], - 'descr': 'Test CPU partition with native EESSI stack' + 'descr': 'AMD Genoa CPU partition with native EESSI stack' }, + { 'name': 'gpu', 'scheduler': 'slurm', @@ -57,7 +86,7 @@ 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], }, - 'descr': 'Test GPU partition with native EESSI stack' + 'descr': 'Nvidia A100 GPU partition with native EESSI stack' }, ] }, @@ -73,9 +102,8 @@ 'logging': common_logging_config(reframe_prefix), 'general': [ { - # For autodetect to work, temporarily change: - # 1. The launchers to srun - # 2. Add --exclusive to GPU 'access' field above (avoids submission error that no GPUs are requested) + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ],