Skip to content

Commit

Permalink
Merge pull request #84 from casparvl/snellius_genoa
Browse files Browse the repository at this point in the history
Added new genoa partition
  • Loading branch information
smoors authored Sep 19, 2023
2 parents b1bc87d + 3bcee84 commit 4f65113
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 32 deletions.
27 changes: 11 additions & 16 deletions config/aws_citc.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# This is an example configuration file
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository

# Note that CPU autodetect currently does not work with this configuration file on AWS.
# This is because there is no system mpirun, and the CPU autodetection doesn't load any modules
# that would make an mpirun command available (as normal multiprocessing tests would).
# In order to do CPU autodetection, you'll need to change the launcher to srun:
# 'launcher = srun'
# You can run the CPU autodetect by listing all tests (reframe -l ...)
# and then, once all CPUs are autodetected, change the launcher back to mpirun for a 'real' run (reframe -r ...)
# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job

# Another known issue is that CPU autodetection fails if run from an actual installation of ReFrame.
# It only works if run from a clone of the ReFrame GitHub repository. See https://github.com/reframe-hpc/reframe/issues/2914
# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914

import os

Expand Down Expand Up @@ -119,6 +118,8 @@
'logging': common_logging_config(reframe_prefix),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
Expand All @@ -127,12 +128,6 @@
# Add default things to each partition:
partition_defaults = {
'scheduler': 'squeue',
# mpirun causes problems with cpu autodetect, since there is no system mpirun.
# See https://github.com/EESSI/test-suite/pull/53#issuecomment-1590849226
# and this feature request https://github.com/reframe-hpc/reframe/issues/2926
# However, using srun requires either using pmix or proper pmi2 integration in the MPI library
# See https://github.com/EESSI/test-suite/pull/53#issuecomment-1598753968
# Thus, we use mpirun for now, and manually swap to srun if we want to autodetect CPUs...
'launcher': 'mpirun',
'environs': ['default'],
'features': [
Expand Down
33 changes: 24 additions & 9 deletions config/izum_vega.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# 3. Temporarily change the 'access' field for the GPU partition to
# 'access': ['-p gpu', '--export=None', '--gres=gpu:1'],

# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job
# 3. Vega doesn't allow submission to the GPU partition without requesting at least one GPU (change #3)

# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914

import os

from eessi.testsuite.common_config import common_logging_config
Expand All @@ -9,13 +24,6 @@

# This is an example configuration file
site_configuration = {
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
'systems': [
{
'name': 'vega',
Expand All @@ -37,7 +45,7 @@
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection
'launcher': 'mpirun',
# Use --export=None to avoid that login environment is passed down to submitted jobs
'access': ['-p cpu', '--export=None'],
'environs': ['default'],
Expand All @@ -60,7 +68,7 @@
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection
'launcher': 'mpirun',
# Use --export=None to avoid that login environment is passed down to submitted jobs
'access': ['-p gpu', '--export=None'],
'environs': ['default'],
Expand Down Expand Up @@ -94,4 +102,11 @@
},
],
'logging': common_logging_config(reframe_prefix),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
}
23 changes: 23 additions & 0 deletions config/settings_example.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# If your system has a GPU partition, it might force jobs to request at least one GPU. If that is the
# case, you also need to temporarily change the 'access' field for the GPU partition to include the request
# for one GPU, e.g. 'access': ['-p gpu', '--export=None', '--gres=gpu:1'],

# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job

# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914


"""
Example configuration file
"""
Expand Down Expand Up @@ -79,6 +95,13 @@
},
],
'logging': common_logging_config(),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
}

# optional logging to syslog
Expand Down
42 changes: 35 additions & 7 deletions config/surf_snellius.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# 3. Temporarily change the 'access' field for the GPU partition to
# 'access': ['-p gpu', '--export=None', '--exclusive'],

# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job
# 3. Snellius doesn't allow submission to the GPU partition without requesting at least one GPU

# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914

import os

from eessi.testsuite.common_config import common_logging_config
Expand All @@ -19,18 +34,32 @@
'stagedir': f'/scratch-shared/{os.environ.get("USER")}/reframe_output/staging',
'partitions': [
{
'name': 'thin',
'name': 'rome',
'scheduler': 'slurm',
'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'],
'launcher': 'mpirun',
'access': ['-p rome', '--export=None'],
'environs': ['default'],
'max_jobs': 120,
'features': [
FEATURES[CPU],
],
'descr': 'AMD Rome CPU partition with native EESSI stack'
},
{
'name': 'genoa',
'scheduler': 'slurm',
'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'],
'launcher': 'mpirun',
'access': ['-p thin', '--export=None'],
'access': ['-p genoa', '--export=None'],
'environs': ['default'],
'max_jobs': 120,
'features': [
FEATURES[CPU],
],
'descr': 'Test CPU partition with native EESSI stack'
'descr': 'AMD Genoa CPU partition with native EESSI stack'
},

{
'name': 'gpu',
'scheduler': 'slurm',
Expand All @@ -57,7 +86,7 @@
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
'descr': 'Test GPU partition with native EESSI stack'
'descr': 'Nvidia A100 GPU partition with native EESSI stack'
},
]
},
Expand All @@ -73,9 +102,8 @@
'logging': common_logging_config(reframe_prefix),
'general': [
{
# For autodetect to work, temporarily change:
# 1. The launchers to srun
# 2. Add --exclusive to GPU 'access' field above (avoids submission error that no GPUs are requested)
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
Expand Down

0 comments on commit 4f65113

Please sign in to comment.