From 0e307acdab5e0ea5c8119fb1b5c429f7d271125d Mon Sep 17 00:00:00 2001 From: James Dinan Date: Thu, 14 Jan 2016 13:23:20 -0500 Subject: [PATCH 1/2] Fix races in shmem_lock_set/clear Correct races in set_lock and clear_lock that occurred in the sequence (1) check if lock object was updated (2) else wait for update. In the second step, the lock object was re-read rather than using the value read in the first step. As a result, if an update arrived between steps, shmem_lock_clear could wait using the new value as the wait condition, resulting in a deadlock. This should close issue #77. Signed-off-by: James Dinan --- src/shmem_lock.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/shmem_lock.h b/src/shmem_lock.h index 94644537a..676571d31 100644 --- a/src/shmem_lock.h +++ b/src/shmem_lock.h @@ -44,12 +44,19 @@ shmem_internal_clear_lock(long *lockp) cond = shmem_internal_my_pe + 1; shmem_internal_cswap(&(lock->last), &zero, &curr, &cond, sizeof(int), 0, DTYPE_INT); shmem_internal_get_wait(); + /* if local PE was not the last to hold the lock, have to look for the next in line */ if (curr != shmem_internal_my_pe + 1) { /* wait for next part of the data block to be non-zero */ - while (NEXT(lock->data) == 0) { - shmem_int_wait(&(lock->data), SIGNAL(lock->data)); + for (;;) { + lock_t lock_cur = *lock; + + if (NEXT(lock_cur.data) != 0) + break; + + shmem_int_wait(&(lock->data), lock_cur.data); } + /* set the signal bit on new lock holder */ shmem_internal_mswap(&(lock->data), &sig, &curr, &sig, sizeof(int), NEXT(lock->data) - 1, DTYPE_INT); shmem_internal_get_wait(); @@ -75,8 +82,13 @@ shmem_internal_set_lock(long *lockp) shmem_internal_mswap(&(lock->data), &me, &curr, &next_mask, sizeof(int), curr - 1, DTYPE_INT); shmem_internal_get_wait(); /* now wait for the signal part of data to be non-zero */ - while (SIGNAL(lock->data) == 0) { - shmem_int_wait(&(lock->data), NEXT(lock->data)); + for (;;) { + lock_t lock_cur = *lock; + + if (SIGNAL(lock_cur.data) != 0) + break; + + shmem_int_wait(&(lock->data), lock_cur.data); } } } From ee3e5335095ba0121bf45c87a94b795a14ee5ef7 Mon Sep 17 00:00:00 2001 From: James Dinan Date: Fri, 15 Jan 2016 11:37:47 -0500 Subject: [PATCH 2/2] Add cswap spinlock test case This case exercises a bug that we are trying to track down. Signed-off-by: James Dinan --- test/unit/Makefile.am | 4 ++ test/unit/test_lock_cswap.c | 111 ++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 test/unit/test_lock_cswap.c diff --git a/test/unit/Makefile.am b/test/unit/Makefile.am index cddf84695..b3feacd21 100644 --- a/test/unit/Makefile.am +++ b/test/unit/Makefile.am @@ -50,6 +50,7 @@ TESTS = \ atomic_inc \ set_lock \ test_lock \ + test_lock_cswap \ fcollect64 \ bigput \ bigget \ @@ -193,6 +194,9 @@ set_lock_LDADD = $(top_builddir)/src/libsma.la test_lock_SOURCES = test_lock.c test_lock_LDADD = $(top_builddir)/src/libsma.la +test_lock_cswap_SOURCES = test_lock_cswap.c +test_lock_cswap_LDADD = $(top_builddir)/src/libsma.la + fcollect64_SOURCES = fcollect64.c fcollect64_LDADD = $(top_builddir)/src/libsma.la diff --git a/test/unit/test_lock_cswap.c b/test/unit/test_lock_cswap.c new file mode 100644 index 000000000..ef6b3d0aa --- /dev/null +++ b/test/unit/test_lock_cswap.c @@ -0,0 +1,111 @@ +/* + * shmem_test_lock_cswap() test_lock {-v|n} + * where: + * -v Enable debugging messages + * -n x Period with which to announce spin count + * + * For n loops: + * Each PE repeatedly attempts to take a simple spinlock on rank 0 using + * cswap, upon success the lock is released and the PE enters a barrier. + * On a failed lock attempt, increment local lock_tries counter and repeat. + */ + +#include +#include +#include +#include +#include + +#define Rfprintf if (shmem_my_pe() == 0) fprintf +#define Rprintf if (shmem_my_pe() == 0) printf +#define Vfprintf if (Verbose) fprintf +#define Vprintf if (Verbose) printf + +int Verbose = 0; +int Announce = 0; +int Noise = 500; +int Loops = 40; + +long lock; + +int +main(int argc, char* argv[]) +{ + int c, cloop; + int my_rank, num_ranks; + + shmem_init(); + my_rank = shmem_my_pe(); + num_ranks = shmem_n_pes(); + + if (num_ranks == 1) { + fprintf(stderr, "ERR - Requires > 1 PEs\n"); + shmem_finalize(); + return 0; + } + + while ((c = getopt(argc,argv,"n:v")) != -1) { + switch (c) { + case 'n': + Noise = atoi(optarg); + break; + case 'v': + Verbose++; + Announce = 1; + break; + default: + Rfprintf(stderr,"ERR - unknown -%c ?\n",c); + shmem_finalize(); + return 1; + } + } + + for (cloop=1; cloop <= Loops; cloop++) { + int got_lock = 0; + int lock_cnt = 0; + int tries = 0; + + lock = 0; + + shmem_barrier_all(); /* sync all ranks */ + + while (!got_lock) { + long lockval = shmem_long_cswap(&lock, 0, my_rank+1, 0); + + if (lockval == 0) { + long unlockval; + got_lock = 1; + + Vprintf("[%d] locked: lock_cnt(%d) lock(%lx)\n", my_rank, lock_cnt, lock); + + unlockval = shmem_long_cswap(&lock, my_rank+1, 0, 0); /* RACE: PE 1 hangs here */ + if (unlockval != my_rank+1) { + printf("[%d] unlock failed, expected %lx got %lx\n", my_rank, (long) my_rank+1, unlockval); + shmem_global_exit(1); + } + + Vprintf("[%d] finished unlock\n", my_rank); + } + else { + tries++; + if ( Announce && ((tries % (num_ranks*Noise)) == 0) ) { + printf("[%d] unsuccessful lock attempts %d lock_cnt %d lock %lx\n", + my_rank, tries, lock_cnt, lock); + } + } + } + shmem_barrier_all(); /* sync all ranks */ + + if ((cloop % 10) == 0) { + if (Announce) { + Rprintf("%d ranks completed %d loops\n", num_ranks, cloop); + } + } + } + + shmem_barrier_all(); /* sync all ranks */ + + Vprintf ("[%d] of %d, Exit\n", my_rank, num_ranks); + shmem_finalize(); + return 0; +}