Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix: use larger mempool parameters to avoid running out of pinnables #3810

Merged
merged 4 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif()

if (NOT CMK_MEMPOOL_CUTOFFNUM)
set(CMK_MEMPOOL_CUTOFFNUM 26)
set(CMK_MEMPOOL_CUTOFFNUM 28)
endif()

set(CMK_OPTIMIZE 0)
Expand Down
2 changes: 1 addition & 1 deletion src/arch/ofi-linux-x86_64/conv-mach.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
if test -z "$USER_OPTS_LD"
then
CMK_INCDIR="-I/usr/include/"
CMK_LIBDIR="-L/usr/lib64/"
# CMK_LIBDIR="-L/usr/lib64/"
fi

CMK_LIBS="$CMK_LIBS -lfabric"
Expand Down
53 changes: 21 additions & 32 deletions src/arch/ofi/machine.C
Original file line number Diff line number Diff line change
Expand Up @@ -200,29 +200,22 @@ static int _tlbpagesize = 4096;
// separate pool of memory mapped huge pages
static CmiInt8 BIG_MSG = 16 * ONE_MB;
#else
static CmiInt8 BIG_MSG = 2 * ONE_MB;
static CmiInt8 BIG_MSG = 16 * ONE_MB;
#endif

void* LrtsPoolAlloc(int n_bytes);

#include "mempool.h"
#if CMK_SMP
// nothing to do here
#else
//minimal per process memory pool use for nonsmp mode
#define USE_SMALL_BASE_POOL_DEFAULTS 1
#endif

#if USE_SMALL_BASE_POOL_DEFAULTS
#define MEMPOOL_INIT_SIZE_MB_DEFAULT 1
#define MEMPOOL_EXPAND_SIZE_MB_DEFAULT 4
#define MEMPOOL_MAX_SIZE_MB_DEFAULT 16
#define MEMPOOL_INIT_SIZE_MB_DEFAULT 64
#define MEMPOOL_EXPAND_SIZE_MB_DEFAULT 64
#define MEMPOOL_MAX_SIZE_MB_DEFAULT 512
#define MEMPOOL_LB_DEFAULT 0
#define MEMPOOL_RB_DEFAULT 32*ONE_MB
#define MEMPOOL_RB_DEFAULT 134217728
#else
#define MEMPOOL_INIT_SIZE_MB_DEFAULT 4
#define MEMPOOL_EXPAND_SIZE_MB_DEFAULT 16
#define MEMPOOL_MAX_SIZE_MB_DEFAULT 512
#define MEMPOOL_INIT_SIZE_MB_DEFAULT 128
#define MEMPOOL_EXPAND_SIZE_MB_DEFAULT 128
#define MEMPOOL_MAX_SIZE_MB_DEFAULT 256
#define MEMPOOL_LB_DEFAULT 0
#define MEMPOOL_RB_DEFAULT 134217728
#endif
Expand All @@ -239,7 +232,7 @@ void* LrtsPoolAlloc(int n_bytes);
#define GetBaseAllocPtr(x) GetMempoolBlockPtr(x)
#define GetMemOffsetFromBase(x) ((char*)(x) - (char *) GetBaseAllocPtr(x))


void* LrtsPoolAlloc(int n_bytes);

CpvDeclare(mempool_type*, mempool);
#else
Expand All @@ -250,25 +243,15 @@ CpvDeclare(mempool_type*, mempool);
#define CmiGetMsgSize(msg) ((((CmiMsgHeaderBasic *)msg)->size))

#define CACHELINE_LEN 64
#if CMK_SMP
#define OFI_NUM_RECV_REQS_DEFAULT 8

#define OFI_NUM_RECV_REQS_DEFAULT 16
#define OFI_NUM_RECV_REQS_MAX 4096

#define OFI_EAGER_MAXSIZE_DEFAULT 65536
#define OFI_EAGER_MAXSIZE_MAX 1048576

#define OFI_CQ_ENTRIES_COUNT_DEFAULT 8
#define OFI_CQ_ENTRIES_COUNT_MAX 1024
#else
#define OFI_NUM_RECV_REQS_DEFAULT 4
#define OFI_NUM_RECV_REQS_MAX 64

#define OFI_EAGER_MAXSIZE_DEFAULT 65536
#define OFI_EAGER_MAXSIZE_MAX 1048576

#define OFI_CQ_ENTRIES_COUNT_DEFAULT 4
#define OFI_CQ_ENTRIES_COUNT_MAX 64
#endif

#define OFI_USE_INJECT_DEFAULT 1

Expand Down Expand Up @@ -924,6 +907,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
#endif //verbose

#if CMK_CXI
OFI_INFO("OFI CXI extensions enabled\n");
if ((context.mr_mode & FI_MR_ENDPOINT)==0)
CmiAbort("OFI::LrtsInit::Unsupported MR mode FI_MR_ENDPOINT");
#else
Expand Down Expand Up @@ -2023,7 +2007,9 @@ void *alloc_mempool_block(size_t *size, mem_handle_t *mem_hndl, int expand_flag)

void free_mempool_block(void *ptr, mem_handle_t mem_hndl)
{
MACHSTATE3(3, "free_mempool_block ptr %p mr %p key %lu\n", ptr, mem_hndl, fi_mr_key(mem_hndl));
free(ptr);
fi_close( (struct fid *) mem_hndl);
}

#endif
Expand Down Expand Up @@ -2663,7 +2649,8 @@ static int ofi_reg_bind_enable(const void *buf,
if (ret) {
MACHSTATE1(3, "fi_mr_reg error: %d\n", ret);
char errstring[100];
snprintf(errstring, 100, "fi_mr_reg error: %d", ret);
const char* fi_errstring=fi_strerror(ret);
snprintf(errstring, 100, "fi_mr_reg error: %d %s", ret, fi_errstring);
CmiAbort(errstring);
}
else{
Expand All @@ -2674,7 +2661,8 @@ static int ofi_reg_bind_enable(const void *buf,
if (ret) {
MACHSTATE1(3, "fi_mr_bind error: %d\n", ret);
char errstring[100];
snprintf(errstring, 100, "fi_mr_bind error: %d", ret);
const char* fi_errstring=fi_strerror(ret);
snprintf(errstring, 100, "fi_mr_bind error: %d %s", ret,fi_errstring);
CmiAbort(errstring);
}
else
Expand All @@ -2685,8 +2673,9 @@ static int ofi_reg_bind_enable(const void *buf,
ret = fi_mr_enable(*mr);
if (ret) {
MACHSTATE1(3, "fi_mr_enable error: %d\n", ret);
char errstring[100];
snprintf(errstring, 100, "fi_mr_enable error: %d", ret);
char errstring[120];
const char* fi_errstring=fi_strerror(ret);
snprintf(errstring, 120, "[%d] fi_mr_enable error: %d handle %lu addr %p len 0x%lX %s", CmiMyPe(), ret,*mr, buf, len, fi_errstring);
CmiAbort(errstring);
}
else
Expand Down
2 changes: 1 addition & 1 deletion src/scripts/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -973,7 +973,7 @@ fi
AC_ARG_WITH([mempool-cutoff],
[AS_HELP_STRING([--with-mempool-cutoff=N],
[exponent of the maximum power of two to use for bin sizes in the mempool])],
[], [with_mempool_cutoff=26])
[], [with_mempool_cutoff=28])

if test "$((6 < $with_mempool_cutoff && $with_mempool_cutoff < 32))" = '1'
then
Expand Down
Loading