From 2a3ec5270d82aa09af923662389bc041fa7815cf Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Fri, 17 May 2024 14:16:00 -0700 Subject: [PATCH] src: Continue passing NIC index where needed --- src/collectives.c | 233 +++++++++++++++++++++------------- src/collectives_c.c4 | 142 +++++++++++++++++------- src/init.c | 4 +- src/lock_c.c | 12 ++- src/shmem_collectives.h | 120 ++++++++++++--------- src/shmem_lock.h | 34 +++--- src/shmem_team.c | 30 +++--- src/shmem_team.h | 7 +- src/symmetric_heap_c.c | 30 ++++-- src/teams_c.c4 | 10 +- 10 files changed, 370 insertions(+), 252 deletions(-) diff --git a/src/collectives.c b/src/collectives.c index ee51f869e..7a277ebba 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -244,7 +244,8 @@ shmem_internal_collectives_init(void) * *****************************************/ void -shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long zero = 0, one = 1; @@ -259,27 +260,27 @@ shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down psync tree */ for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; i++, pe += PE_stride) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe, nic_idx); } } else { /* send message to root */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), PE_start, - SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack down psync tree */ SHMEM_WAIT(pSync, 0); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - 
shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -287,7 +288,8 @@ shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync void -shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long zero = 0, one = 1; int parent, num_children, *children; @@ -318,13 +320,13 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down to children */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } else { @@ -332,20 +334,20 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* send ack to parent */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack from parent */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, num_children + 1); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down to children */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } @@ -354,21 +356,22 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* send message up psync tree */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, 
pSync, &one, sizeof(one), parent, - SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack down psync tree */ SHMEM_WAIT(pSync, 0); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } } void -shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int one = 1, neg_one = -1; int distance, to, i; @@ -389,7 +392,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync to = PE_start + (to * PE_stride); shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), - to, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + to, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); SHMEM_WAIT_UNTIL(&pSync_ints[i], SHMEM_CMP_NE, 0); /* There's a path where the next update from a peer can get @@ -399,7 +402,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync /* this slot is no longer used, so subtract off results now */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &neg_one, sizeof(int), - shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); } /* Ensure local pSync decrements are done before a subsequent barrier */ @@ -415,7 +418,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync void shmem_internal_bcast_linear(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { long zero = 0, one = 1; int real_root = PE_start + PE_root * PE_stride; @@ -432,16 +435,16 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* send data to all peers */ for (pe = 
PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; - shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion); + shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion, nic_idx); } - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion ack to all peers */ for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe, nic_idx); } if (1 == complete) { @@ -450,7 +453,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -460,13 +463,13 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); if (1 == complete) { /* send ack back to root */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - real_root, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + real_root, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } } @@ -475,7 +478,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, void shmem_internal_bcast_tree(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { long zero = 0, one = 1; long completion = 0; @@ -510,23 +513,23 @@ shmem_internal_bcast_tree(void *target, const void 
*source, size_t len, /* if complete, send ack */ if (1 == complete) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } /* send data to all leaves */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, send_buf, len, children[i], - &completion); + &completion, nic_idx); } - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion ack to all peers */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), - children[i]); + children[i], nic_idx); } if (1 == complete) { @@ -539,7 +542,7 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { @@ -549,12 +552,12 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* if complete, send ack */ if (1 == complete) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } } @@ -569,7 +572,8 @@ void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { long zero = 0, one = 1; @@ -586,22 +590,22 @@ 
shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, will flush any atomic cache value that may currently exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, - shmem_internal_my_pe, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_my_pe, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; i++, pe += PE_stride) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe, nic_idx); } /* Wait for others to acknowledge sending data */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, PE_size - 1); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { @@ -609,22 +613,22 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, SHMEM_WAIT(pSync, 0); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* send data, ack, and wait for completion */ shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, source, count * type_size, - PE_start, op, datatype, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + PE_start, op, datatype, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - PE_start, 
SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* broadcast out */ shmem_internal_bcast(target, target, count * type_size, 0, - PE_start, PE_stride, PE_size, pSync + 2, 0); + PE_start, PE_stride, PE_size, pSync + 2, 0, nic_idx); } @@ -635,7 +639,8 @@ void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { int group_rank = (shmem_internal_my_pe - PE_start) / PE_stride; long zero = 0, one = 1; @@ -650,7 +655,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si if (PE_size == 1) { if (target != source) - shmem_internal_copy_self(target, source, count * type_size); + shmem_internal_copy_self(target, source, count * type_size, nic_idx); return; } @@ -662,11 +667,11 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si if (NULL == tmp) RAISE_ERROR_MSG("Unable to allocate %zub temporary buffer\n", count*type_size); - shmem_internal_copy_self(tmp, target, count * type_size); + shmem_internal_copy_self(tmp, target, count * type_size, nic_idx); free_source = 1; source = tmp; - shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2, nic_idx); } /* Perform reduce-scatter: @@ -700,10 +705,10 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si i == 0 ? 
((uint8_t *) source) + chunk_out_disp : ((uint8_t *) target) + chunk_out_disp, - chunk_out_count * type_size, peer); + chunk_out_count * type_size, peer, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* Wait for chunk */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_GE, i+1); @@ -714,7 +719,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si } /* Reset reduce-scatter pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Perform all-gather: @@ -733,17 +738,17 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) target) + chunk_out_disp, ((uint8_t *) target) + chunk_out_disp, - chunk_out_count * type_size, peer); + chunk_out_count * type_size, peer, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync+1, &one, sizeof(one), - peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* Wait for chunk */ SHMEM_WAIT_UNTIL(pSync+1, SHMEM_CMP_GE, i+1); } /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync+1, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync+1, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync+1, SHMEM_CMP_EQ, 0); if (free_source) @@ -755,7 +760,8 @@ void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, 
shm_internal_datatype_t datatype, + size_t nic_idx) { long zero = 0, one = 1; long completion = 0; @@ -766,7 +772,7 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si if (PE_size == 1) { if (target != source) { - shmem_internal_copy_self(target, source, type_size * count); + shmem_internal_copy_self(target, source, type_size * count, nic_idx); } return; } @@ -791,20 +797,20 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si will flush any atomic cache value that may currently exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, - shmem_internal_my_pe, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_my_pe, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ for (i = 0 ; i < num_children ; ++i) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &one, sizeof(one), children[i]); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &one, sizeof(one), children[i], nic_idx); } /* Wait for others to acknowledge sending data */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, num_children); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -813,24 +819,24 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si SHMEM_WAIT(pSync + 1, 0); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync + 1, SHMEM_CMP_EQ, 0); /* send data, ack, and wait for completion */ 
shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, (num_children == 0) ? source : target, count * type_size, parent, - op, datatype, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + op, datatype, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* broadcast out */ shmem_internal_bcast(target, target, count * type_size, 0, PE_start, - PE_stride, PE_size, pSync + 2, 0); + PE_stride, PE_size, pSync + 2, 0, nic_idx); } @@ -838,7 +844,8 @@ void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int log2_proc = 1, pow2_proc = 2; @@ -851,7 +858,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun if (PE_size == 1) { if (target != source) { - shmem_internal_copy_self(target, source, type_size * count); + shmem_internal_copy_self(target, source, type_size * count, nic_idx); } free(current_target); return; @@ -896,17 +903,17 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_target_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, - &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer); 
+ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); } else { if (my_id < PE_size - pow2_proc) { int peer = (my_id + pow2_proc) * PE_stride + PE_start; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); shmem_internal_reduce_local(op, datatype, count, target, current_target); @@ -922,25 +929,25 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun if (shmem_internal_my_pe < peer) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_target_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_data_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, - wrk_size, peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + wrk_size, peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); } else { SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_target_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, - wrk_size, peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + wrk_size, peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_data_ready); } @@ -954,11 +961,11 @@ 
shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun int peer = (my_id + pow2_proc) * PE_stride + PE_start; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, - peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); } memcpy(target, current_target, wrk_size); @@ -978,7 +985,8 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun *****************************************/ void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { size_t my_offset; long tmp[2]; @@ -991,7 +999,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, target, source, len, PE_start, PE_stride, PE_size, (void*) pSync); if (PE_size == 1) { - if (target != source) shmem_internal_copy_self(target, source, len); + if (target != source) shmem_internal_copy_self(target, source, len, nic_idx); return; } @@ -1000,7 +1008,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, my_offset = 0; tmp[0] = (long) len; /* FIXME: Potential truncation of size_t into long */ tmp[1] = 1; /* FIXME: Packing flag with data relies on byte ordering */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride, nic_idx); } else { /* wait for send data */ @@ -1012,7 +1020,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, tmp[0] = (long) (my_offset + len); tmp[1] = 1; 
shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), - shmem_internal_my_pe + PE_stride); + shmem_internal_my_pe + PE_stride, nic_idx); } } @@ -1024,13 +1032,13 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, do { if (len > 0) { shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) target) + my_offset, source, - len, peer); + len, peer, nic_idx); } peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2]); + shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2], nic_idx); pSync[0] = SHMEM_SYNC_VALUE; pSync[1] = SHMEM_SYNC_VALUE; @@ -1047,7 +1055,8 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, *****************************************/ void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long tmp = 1; long completion = 0; @@ -1057,36 +1066,36 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, if (PE_start == shmem_internal_my_pe) { /* Copy data into the target */ - if (source != target) shmem_internal_copy_self(target, source, len); + if (source != target) shmem_internal_copy_self(target, source, len, nic_idx); /* send completion update */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(long), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for N updates */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, PE_size); /* Clear pSync */ tmp = 0; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(tmp), PE_start); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(tmp), PE_start, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { /* Push data into the target */ 
size_t offset = ((shmem_internal_my_pe - PE_start) / PE_stride) * len; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + offset, source, len, PE_start, - &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); /* ensure ordering */ shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion update */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(long), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } shmem_internal_bcast(target, target, len * PE_size, 0, PE_start, PE_stride, - PE_size, pSync + 1, 0); + PE_size, pSync + 1, 0, nic_idx); } @@ -1099,7 +1108,8 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int i; /* my_id is the index in a theoretical 0...N-1 array of @@ -1115,7 +1125,7 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, if (len == 0) return; /* copy my portion to the right place */ - shmem_internal_copy_self((char*) target + (my_id * len), source, len); + shmem_internal_copy_self((char*) target + (my_id * len), source, len, nic_idx); /* send n - 1 messages to the next highest proc. 
Each message contains what we received the previous step (including our own @@ -1125,8 +1135,8 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, /* send data to me + 1 */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + iter_offset, (char*) target + iter_offset, - len, next_proc, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + len, next_proc, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion for this round to next proc. Note that we @@ -1134,14 +1144,14 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, between successive calls to the put above. So a rolling counter is safe here. */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), - next_proc, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + next_proc, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for completion for this round */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_GE, i); } /* zero out psync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(long), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(long), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -1155,7 +1165,8 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int i; @@ -1179,7 +1190,7 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* copy my portion to the right place */ curr_offset = my_id * len; - shmem_internal_copy_self((char*) target + curr_offset, source, len); + shmem_internal_copy_self((char*) target + curr_offset, 
source, len, nic_idx); for (i = 0, distance = 0x1 ; distance < PE_size ; i++, distance <<= 1) { int peer = my_id ^ distance; @@ -1187,19 +1198,19 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* send data to peer */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + curr_offset, (char*) target + curr_offset, - distance * len, real_peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + distance * len, real_peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* mark completion for this round */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), - real_peer, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + real_peer, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); SHMEM_WAIT_UNTIL(&pSync_ints[i], SHMEM_CMP_NE, 0); /* this slot is no longer used, so subtract off results now */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &neg_one, sizeof(int), - shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); if (my_id > peer) { curr_offset -= (distance * len); @@ -1212,7 +1223,8 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_ptr = (uint8_t *) dest + my_as_rank * len; @@ -1232,12 +1244,12 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, int peer_as_rank = (peer - PE_start) / PE_stride; /* Peer's index in active set */ shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source + peer_as_rank * len, - len, peer); + len, peer, nic_idx); peer = 
shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync, nic_idx); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; @@ -1247,7 +1259,8 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_base = (uint8_t *) dest + my_as_rank * nelems * dst * elem_size; @@ -1279,7 +1292,7 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, for (i = nelems ; i > 0; i--) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source_ptr, - elem_size, peer); + elem_size, peer, nic_idx); source_ptr += sst * elem_size; dest_ptr += dst * elem_size; @@ -1288,7 +1301,7 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync, nic_idx); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; diff --git a/src/collectives_c.c4 b/src/collectives_c.c4 index 70c8876b5..62c9c7ce5 100644 --- a/src/collectives_c.c4 +++ b/src/collectives_c.c4 @@ -158,7 +158,9 @@ shmem_barrier_all(void) { SHMEM_ERR_CHECK_INITIALIZED(); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); } @@ -169,7 +171,9 @@ shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); 
SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -178,7 +182,9 @@ shmem_sync_all(void) { SHMEM_ERR_CHECK_INITIALIZED(); - shmem_internal_sync_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_sync_all(nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -188,7 +194,9 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } /* Team-based Collective Routines */ @@ -199,9 +207,11 @@ shmem_team_sync(shmem_team_t team) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_TEAM_VALID(team); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, SYNC); - shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync); + long *psync = shmem_internal_team_choose_psync(myteam, SYNC, nic_idx); + shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, SYNC); return 0; } @@ -228,9 +238,11 @@ shmem_team_sync(shmem_team_t team) SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE)*nreduce, \ sizeof(TYPE)*nreduce, 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_op_to_all(target, source, nreduce, sizeof(TYPE), \ PE_start, 1 << logPE_stride, PE_size, \ - pWrk, pSync, IOP, ITYPE); \ + pWrk, pSync, IOP, ITYPE, nic_idx); \ } #define 
SHMEM_DEF_REDUCE(STYPE,TYPE,ITYPE,SOP,IOP) \ @@ -247,11 +259,14 @@ shmem_team_sync(shmem_team_t team) sizeof(TYPE)*nreduce, 1, 1); \ TYPE *pWrk = NULL; \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, REDUCE); \ + long *psync = shmem_internal_team_choose_psync(myteam, REDUCE, \ + nic_idx); \ shmem_internal_op_to_all(dest, source, nreduce, sizeof(TYPE), \ myteam->start, myteam->stride, myteam->size, pWrk, \ - psync, IOP, ITYPE); \ + psync, IOP, ITYPE, nic_idx); \ shmem_internal_team_release_psyncs(myteam, REDUCE); \ return 0; \ } @@ -292,9 +307,11 @@ shmem_broadcast32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_bcast(target, source, nlong * 4, PE_root, PE_start, 1 << logPE_stride, PE_size, - pSync, 1); + pSync, 1, nic_idx); } @@ -311,9 +328,11 @@ shmem_broadcast64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_bcast(target, source, nlong * 8, PE_root, PE_start, 1 << logPE_stride, PE_size, - pSync, 1); + pSync, 1, nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES @@ -327,15 +346,17 @@ shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, BCAST); + long *psync = shmem_internal_team_choose_psync(myteam, BCAST, 
nic_idx); shmem_internal_bcast(dest, source, nelems, PE_root, myteam->start, myteam->stride, myteam->size, - psync, 1); + psync, 1, nic_idx); shmem_internal_team_release_psyncs(myteam, BCAST); int team_root = myteam->start + PE_root * myteam->stride; if (shmem_internal_my_pe == team_root && dest != source) - shmem_internal_copy_self(dest, source, nelems); + shmem_internal_copy_self(dest, source, nelems, nic_idx); return 0; } @@ -353,16 +374,19 @@ shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, BCAST); \ + long *psync = shmem_internal_team_choose_psync(myteam, BCAST, \ + nic_idx); \ shmem_internal_bcast(dest, source, nelems * sizeof(TYPE), \ PE_root, myteam->start, myteam->stride, \ - myteam->size, psync, 1); \ + myteam->size, psync, 1, nic_idx); \ shmem_internal_team_release_psyncs(myteam, BCAST); \ int team_root = myteam->start + PE_root * myteam->stride; \ if (shmem_internal_my_pe == team_root && dest != source) { \ shmem_internal_copy_self(dest, source, \ - nelems * sizeof(TYPE)); \ + nelems * sizeof(TYPE), nic_idx); \ } \ return 0; \ } @@ -380,8 +404,10 @@ shmem_collect32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_collect(target, source, nlong * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -396,8 +422,10 @@ shmem_collect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, 
source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_collect(target, source, nlong * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_COLLECT(STYPE,TYPE) \ @@ -412,12 +440,15 @@ shmem_collect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - COLLECT); \ + COLLECT, \ + nic_idx); \ shmem_internal_collect(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, COLLECT); \ return 0; \ } @@ -434,10 +465,12 @@ shmem_collectmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT, nic_idx); shmem_internal_collect(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, COLLECT); return 0; } @@ -453,8 +486,10 @@ shmem_fcollect32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_fcollect(target, source, nlong * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + 
PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -469,8 +504,10 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_fcollect(target, source, nlong * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_FCOLLECT(STYPE,TYPE) \ @@ -485,12 +522,15 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - COLLECT); \ + COLLECT, \ + nic_idx); \ shmem_internal_fcollect(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, COLLECT); \ return 0; \ } @@ -507,10 +547,12 @@ shmem_fcollectmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT, nic_idx); shmem_internal_fcollect(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, COLLECT); return 0; } @@ -526,8 +568,10 @@ shmem_alltoall32(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_SYMMETRIC(pSync, 
sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * 4, nelems * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoall(dest, source, nelems * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -542,8 +586,10 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * 8, nelems * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoall(dest, source, nelems * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_ALLTOALL(STYPE,TYPE) \ @@ -558,12 +604,15 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - ALLTOALL); \ + ALLTOALL, \ + nic_idx); \ shmem_internal_alltoall(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ return 0; \ } @@ -580,10 +629,12 @@ shmem_alltoallmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, nic_idx); shmem_internal_alltoall(dest, source, nelems, 
myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, ALLTOALL); return 0; } @@ -602,8 +653,10 @@ shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(source, 4 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 4, nelems, PE_start, - 1 << logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -620,8 +673,10 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(source, 8 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 8, nelems, PE_start, - 1 << logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_ALLTOALLS(STYPE,TYPE) \ @@ -635,11 +690,14 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); \ + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, \ + nic_idx); \ shmem_internal_alltoalls(dest, source, dst, sst, sizeof(TYPE), \ nelems, myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ return 0; \ } @@ -655,11 +713,13 @@ shmem_alltoallsmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(dest, 
nelems); SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 1, nelems, myteam->start, myteam->stride, myteam->size, - psync); + psync, nic_idx); shmem_internal_team_release_psyncs(myteam, ALLTOALL); return 0; } diff --git a/src/init.c b/src/init.c index 01ca23dfd..b7480ebe2 100644 --- a/src/init.c +++ b/src/init.c @@ -143,7 +143,9 @@ shmem_internal_shutdown(void) return; } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_finalized = 1; diff --git a/src/lock_c.c b/src/lock_c.c index 269dd0ed2..7008dd8f8 100644 --- a/src/lock_c.c +++ b/src/lock_c.c @@ -44,7 +44,9 @@ shmem_clear_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - shmem_internal_clear_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_clear_lock(lockp, nic_idx); } @@ -54,7 +56,9 @@ shmem_set_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - shmem_internal_set_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_set_lock(lockp, nic_idx); } @@ -64,5 +68,7 @@ shmem_test_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - return shmem_internal_test_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + return shmem_internal_test_lock(lockp, nic_idx); } diff --git a/src/shmem_collectives.h b/src/shmem_collectives.h index 6409c5178..acfae7b41 100644 --- a/src/shmem_collectives.h +++ b/src/shmem_collectives.h @@ -40,13 +40,13 @@ extern coll_type_t 
shmem_internal_reduce_type; extern coll_type_t shmem_internal_collect_type; extern coll_type_t shmem_internal_fcollect_type; -void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync); -void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync); -void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync); +void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); +void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); +void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); static inline void -shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx) { if (shmem_internal_params.BARRIERS_FLUSH) { fflush(stdout); @@ -58,19 +58,19 @@ shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) switch (shmem_internal_barrier_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { - shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync, nic_idx); } else { - shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync, nic_idx); } break; case LINEAR: - shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync, nic_idx); break; case TREE: - shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync, nic_idx); break; case DISSEM: - shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync, nic_idx); break; default: RAISE_ERROR_MSG("Illegal barrier/sync type (%d)\n", @@ 
-85,60 +85,64 @@ shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) static inline void -shmem_internal_sync_all(void) +shmem_internal_sync_all(size_t nic_idx) { - shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync, nic_idx); } static inline void -shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync, nic_idx); } static inline void -shmem_internal_barrier_all(void) +shmem_internal_barrier_all(size_t nic_idx) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync, nic_idx); } void shmem_internal_bcast_linear(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete); + long *pSync, int complete, size_t nic_idx); void shmem_internal_bcast_tree(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete); + long *pSync, int complete, size_t nic_idx); static inline void shmem_internal_bcast(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { switch (shmem_internal_bcast_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); } else { shmem_internal_bcast_tree(target, 
source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); } break; case LINEAR: shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); break; case TREE: shmem_internal_bcast_tree(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); break; default: RAISE_ERROR_MSG("Illegal broadcast type (%d)\n", @@ -150,20 +154,24 @@ shmem_internal_bcast(void *target, const void *source, size_t len, void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); static inline void @@ -171,7 +179,7 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, size_t type_size, int PE_start, 
int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(type_size > 0); @@ -181,21 +189,21 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_op_to_all_linear(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_tree(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } } else { if (count * type_size < shmem_internal_params.COLL_SIZE_CROSSOVER) shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); else shmem_internal_op_to_all_ring(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; @@ -203,33 +211,33 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_linear(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; case RING: shmem_internal_op_to_all_ring(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); break; case TREE: if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_tree(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, 
op, datatype, nic_idx); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; case RECDBL: shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); break; default: RAISE_ERROR_MSG("Illegal reduction type (%d)\n", @@ -239,21 +247,23 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); static inline void shmem_internal_collect(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { switch (shmem_internal_collect_type) { case AUTO: shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case LINEAR: shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; default: RAISE_ERROR_MSG("Illegal collect type (%d)\n", @@ -263,37 +273,41 @@ shmem_internal_collect(void *target, const void *source, size_t len, void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int 
PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); static inline void shmem_internal_fcollect(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { switch (shmem_internal_fcollect_type) { case AUTO: shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case LINEAR: shmem_internal_fcollect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case RING: shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case RECDBL: if (0 == (PE_size & (PE_size - 1))) { shmem_internal_fcollect_recdbl(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); } else { shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); } break; default: @@ -304,9 +318,11 @@ shmem_internal_fcollect(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); #endif diff --git a/src/shmem_lock.h b/src/shmem_lock.h index 158cafc84..ca78e5410 100644 --- a/src/shmem_lock.h +++ b/src/shmem_lock.h @@ -37,7 +37,7 @@ typedef struct lock_t lock_t; static inline void -shmem_internal_clear_lock(long *lockp) +shmem_internal_clear_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, cond, 
zero = 0, sig = SIGNAL_MASK; @@ -47,8 +47,8 @@ shmem_internal_clear_lock(long *lockp) /* release the lock if I'm the last to try to obtain it */ cond = shmem_internal_my_pe + 1; shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &zero, &curr, &cond, - sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? /* if local PE was not the last to hold the lock, look for the next in line */ if (curr != shmem_internal_my_pe + 1) { @@ -58,8 +58,8 @@ shmem_internal_clear_lock(long *lockp) for (;;) { shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), sizeof(int), shmem_internal_my_pe, - SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? if (NEXT(cur_data) != 0) break; @@ -69,21 +69,21 @@ shmem_internal_clear_lock(long *lockp) /* set the signal bit on new lock holder */ shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &sig, &curr, - &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT, 0);// Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT, nic_idx);// Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
} } static inline void -shmem_internal_set_lock(long *lockp) +shmem_internal_set_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, zero = 0, me = shmem_internal_my_pe + 1; /* initialize my elements to zero */ shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, &(lock->data), &zero, - sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT); + sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* update last with my value to add me to the queue */ @@ -96,16 +96,16 @@ shmem_internal_set_lock(long *lockp) int next_mask = NEXT_MASK; shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &me, &curr, - &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? /* now wait for the signal part of data to be non-zero */ for (;;) { int cur_data; shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), - sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
if (SIGNAL(cur_data) != 0) break; @@ -122,20 +122,20 @@ shmem_internal_set_lock(long *lockp) static inline int -shmem_internal_test_lock(long *lockp) +shmem_internal_test_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, me = shmem_internal_my_pe + 1, zero = 0; /* initialize my elements to zero */ shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, &(lock->data), &zero, - sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT); + sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* add self to last if and only if the lock is zero (ie, no one has the lock) */ shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &me, &curr, &zero, - sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
if (0 == curr) { shmem_internal_membar_acquire(); diff --git a/src/shmem_team.c b/src/shmem_team.c index f0476ab6b..833a03a17 100644 --- a/src/shmem_team.c +++ b/src/shmem_team.c @@ -289,7 +289,7 @@ int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const shmem_team_config_t *config, long config_mask, - shmem_internal_team_t **new_team) + shmem_internal_team_t **new_team, size_t nic_idx) { *new_team = SHMEM_TEAM_INVALID; @@ -320,7 +320,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE int my_pe = shmem_internal_pe_in_active_set(shmem_internal_my_pe, global_PE_start, PE_stride, PE_size); - long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE); + long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE, nic_idx); shmem_internal_team_t *myteam = NULL; *team_ret_val = 0; *team_ret_val_reduced = 0; @@ -366,7 +366,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE shmem_internal_op_to_all(psync_pool_avail_reduced, psync_pool_avail, N_PSYNC_BYTES, 1, myteam->start, PE_stride, PE_size, NULL, - psync, SHM_INTERNAL_BAND, SHM_INTERNAL_UCHAR); + psync, SHM_INTERNAL_BAND, SHM_INTERNAL_UCHAR, nic_idx); /* We cannot release the psync here, because this reduction may not * have been performed on the entire parent team. */ @@ -406,18 +406,18 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE /* This barrier on the parent team eliminates problematic race conditions * during psync allocation between back-to-back team creations. 
*/ - psync = shmem_internal_team_choose_psync(parent_team, SYNC); + psync = shmem_internal_team_choose_psync(parent_team, SYNC, nic_idx); - shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync); + shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync, nic_idx); shmem_internal_team_release_psyncs(parent_team, SYNC); /* This OR reduction assures all PEs return the same value. */ - psync = shmem_internal_team_choose_psync(parent_team, REDUCE); + psync = shmem_internal_team_choose_psync(parent_team, REDUCE, nic_idx); shmem_internal_op_to_all(team_ret_val_reduced, team_ret_val, 1, sizeof(int), parent_team->start, parent_team->stride, parent_team->size, NULL, - psync, SHM_INTERNAL_MAX, SHM_INTERNAL_INT); + psync, SHM_INTERNAL_MAX, SHM_INTERNAL_INT, nic_idx); shmem_internal_team_release_psyncs(parent_team, REDUCE); @@ -433,7 +433,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team, const shmem_team_config_t *yaxis_config, - long yaxis_mask, shmem_internal_team_t **yaxis_team) + long yaxis_mask, shmem_internal_team_t **yaxis_team, size_t nic_idx) { *xaxis_team = SHMEM_TEAM_INVALID; *yaxis_team = SHMEM_TEAM_INVALID; @@ -460,7 +460,8 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, int xsize = (i == num_xteams - 1 && parent_size % xrange) ? 
parent_size % xrange : xrange; ret = shmem_internal_team_split_strided(parent_team, start, parent_stride, - xsize, xaxis_config, xaxis_mask, &my_xteam); + xsize, xaxis_config, xaxis_mask, &my_xteam, + nic_idx); if (ret) { RAISE_ERROR_MSG("Creation of x-axis team %d of %d failed\n", i+1, num_xteams); } @@ -481,7 +482,8 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, int ysize = (remainder && i < remainder) ? yrange + 1 : yrange; ret = shmem_internal_team_split_strided(parent_team, start, xrange*parent_stride, - ysize, yaxis_config, yaxis_mask, &my_yteam); + ysize, yaxis_config, yaxis_mask, &my_yteam, + nic_idx); if (ret) { RAISE_ERROR_MSG("Creation of y-axis team %d of %d failed\n", i+1, num_yteams); } @@ -493,9 +495,9 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, } } - long *psync = shmem_internal_team_choose_psync(parent_team, SYNC); + long *psync = shmem_internal_team_choose_psync(parent_team, SYNC, nic_idx); - shmem_internal_barrier(parent_start, parent_stride, parent_size, psync); + shmem_internal_barrier(parent_start, parent_stride, parent_size, psync, nic_idx); shmem_internal_team_release_psyncs(parent_team, SYNC); @@ -535,7 +537,7 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) /* Returns a psync from the given team that can be safely used for the * specified collective operation. 
*/ -long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op) +long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op, size_t nic_idx) { switch (op) { @@ -556,7 +558,7 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter size_t psync = team->psync_idx * SHMEM_SYNC_SIZE; shmem_internal_sync(team->start, team->stride, team->size, - &shmem_internal_psync_barrier_pool[psync]); + &shmem_internal_psync_barrier_pool[psync], nic_idx); for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; diff --git a/src/shmem_team.h b/src/shmem_team.h index 195730864..bf006c8b6 100644 --- a/src/shmem_team.h +++ b/src/shmem_team.h @@ -58,11 +58,12 @@ int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const shmem_team_config_t *config, long config_mask, - shmem_internal_team_t **new_team); + shmem_internal_team_t **new_team, size_t nic_idx); int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team, - const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team); + const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team, + size_t nic_idx); int shmem_internal_team_destroy(shmem_internal_team_t *team); @@ -70,7 +71,7 @@ int shmem_internal_team_create_ctx(shmem_internal_team_t *team, long options, sh int shmem_internal_ctx_get_team(shmem_ctx_t ctx, shmem_internal_team_t **team); -long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op); +long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op, size_t nic_idx); void 
shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op); diff --git a/src/symmetric_heap_c.c b/src/symmetric_heap_c.c index 30b319ea9..176f4d01b 100644 --- a/src/symmetric_heap_c.c +++ b/src/symmetric_heap_c.c @@ -295,7 +295,9 @@ shmem_malloc(size_t size) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -313,7 +315,9 @@ shmem_calloc(size_t count, size_t size) ret = dlcalloc(count, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -326,7 +330,9 @@ shmem_free(void *ptr) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_free(ptr); } @@ -344,7 +350,9 @@ shmem_realloc(void *ptr, size_t size) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); if (size == 0 && ptr != NULL) { @@ -355,7 +363,7 @@ shmem_realloc(void *ptr, size_t size) } SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -376,7 +384,9 @@ shmem_align(size_t alignment, size_t size) ret = dlmemalign(alignment, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -430,9 +440,11 @@ shmem_malloc_with_hints(size_t size, long hints) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - if (!(hints & 
SHMEMX_MALLOC_NO_BARRIER)) - shmem_internal_barrier_all(); - + if (!(hints & SHMEMX_MALLOC_NO_BARRIER)) { + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); + } return ret; } diff --git a/src/teams_c.c4 b/src/teams_c.c4 index c86065f73..1c0e3aa7f 100644 --- a/src/teams_c.c4 +++ b/src/teams_c.c4 @@ -115,9 +115,12 @@ shmem_team_split_strided(shmem_team_t parent_team, int PE_start, { SHMEM_ERR_CHECK_INITIALIZED(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); return shmem_internal_team_split_strided((shmem_internal_team_t *)parent_team, PE_start, PE_stride, PE_size, config, - config_mask, (shmem_internal_team_t **)new_team); + config_mask, (shmem_internal_team_t **)new_team, + nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES @@ -128,11 +131,14 @@ shmem_team_split_2d(shmem_team_t parent_team, int xrange, { SHMEM_ERR_CHECK_INITIALIZED(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); return shmem_internal_team_split_2d((shmem_internal_team_t *)parent_team, xrange, xaxis_config, xaxis_mask, (shmem_internal_team_t **)xaxis_team, yaxis_config, yaxis_mask, - (shmem_internal_team_t **)yaxis_team); + (shmem_internal_team_t **)yaxis_team, + nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES