diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index ddfd4693c13..a0091793ab4 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -94,6 +94,8 @@ typedef struct mca_pml_ob1_t mca_pml_ob1_t; extern mca_pml_ob1_t mca_pml_ob1; extern int mca_pml_ob1_output; extern bool mca_pml_ob1_matching_protection; +extern int mca_pml_ob1_accelerator_events_max; /* Element count of the accelerator event and frag arrays allocated in pml_ob1_accelerator.c; defined in pml_ob1_component.c. */ + /* * PML interface functions. */ diff --git a/ompi/mca/pml/ob1/pml_ob1_accelerator.c b/ompi/mca/pml/ob1/pml_ob1_accelerator.c index 737560db302..5e2902a1d17 100644 --- a/ompi/mca/pml/ob1/pml_ob1_accelerator.c +++ b/ompi/mca/pml/ob1/pml_ob1_accelerator.c @@ -72,7 +72,6 @@ static int accelerator_event_dtoh_first_used, accelerator_event_htod_first_used; static volatile int accelerator_event_dtoh_num_used, accelerator_event_htod_num_used; /* Size of array holding events */ -static int accelerator_event_max = 400; static int accelerator_event_htod_most = 0; int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag) @@ -87,9 +86,9 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f * return an error. The error message will tell the user to try and * run again, but with a larger array for storing events. */ OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock); - if (accelerator_event_htod_num_used == accelerator_event_max) { - opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca mpi_common_accelerator_event_max %d.", - accelerator_event_max, accelerator_event_max + 100); + if (accelerator_event_htod_num_used == mca_pml_ob1_accelerator_events_max) { + opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. 
Suggested to rerun with new max with --mca pml_ob1_accelerator_events_max %d.", + mca_pml_ob1_accelerator_events_max, mca_pml_ob1_accelerator_events_max + 100); OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock); return OPAL_ERR_OUT_OF_RESOURCE; } @@ -113,7 +112,7 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f /* Bump up the first available slot and number used by 1 */ accelerator_event_htod_first_avail++; - if (accelerator_event_htod_first_avail >= accelerator_event_max) { + if (accelerator_event_htod_first_avail >= mca_pml_ob1_accelerator_events_max) { accelerator_event_htod_first_avail = 0; } accelerator_event_htod_num_used++; @@ -169,7 +168,7 @@ int mca_pml_ob1_progress_one_htod_event(struct mca_btl_base_descriptor_t **frag) /* Bump counters, loop around the circular buffer if necessary */ --accelerator_event_htod_num_used; ++accelerator_event_htod_first_used; - if (accelerator_event_htod_first_used >= accelerator_event_max) { + if (accelerator_event_htod_first_used >= mca_pml_ob1_accelerator_events_max) { accelerator_event_htod_first_used = 0; } /* A return value of 1 indicates an event completed and a frag was returned */ @@ -214,7 +213,7 @@ int mca_pml_ob1_accelerator_init(void) accelerator_event_dtoh_first_avail = 0; accelerator_event_dtoh_first_used = 0; - accelerator_event_dtoh_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *)); + accelerator_event_dtoh_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *)); if (NULL == accelerator_event_dtoh_array) { opal_output_verbose(1, mca_pml_ob1_output, "No memory."); rc = OPAL_ERROR; @@ -222,7 +221,7 @@ int mca_pml_ob1_accelerator_init(void) } /* Create the events since they can be reused. 
*/ - for (i = 0; i < accelerator_event_max; i++) { + for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) { result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_dtoh_array[i]); if (OPAL_SUCCESS != result) { opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed."); @@ -234,7 +233,7 @@ int mca_pml_ob1_accelerator_init(void) /* The first available status index is 0. Make an empty frag array. */ accelerator_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max); + sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max); if (NULL == accelerator_event_dtoh_frag_array) { opal_output_verbose(1, mca_pml_ob1_output, "No memory."); rc = OPAL_ERROR; @@ -247,7 +246,7 @@ int mca_pml_ob1_accelerator_init(void) accelerator_event_htod_first_avail = 0; accelerator_event_htod_first_used = 0; - accelerator_event_htod_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *)); + accelerator_event_htod_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *)); if (NULL == accelerator_event_htod_array) { opal_output_verbose(1, mca_pml_ob1_output, "No memory."); rc = OPAL_ERROR; @@ -255,7 +254,7 @@ int mca_pml_ob1_accelerator_init(void) } /* Create the events since they can be reused. */ - for (i = 0; i < accelerator_event_max; i++) { + for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) { result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_htod_array[i]); if (OPAL_SUCCESS != result) { opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed."); @@ -267,7 +266,7 @@ int mca_pml_ob1_accelerator_init(void) /* The first available status index is 0. Make an empty frag array. 
*/ accelerator_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc( - sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max); + sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max); if (NULL == accelerator_event_htod_frag_array) { opal_output_verbose(1, mca_pml_ob1_output, "No memory."); rc = OPAL_ERROR; @@ -304,7 +303,7 @@ void mca_pml_ob1_accelerator_fini(void) } if (NULL != accelerator_event_htod_array) { - for (i = 0; i < accelerator_event_max; i++) { + for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) { if (NULL != accelerator_event_htod_array[i]) { OBJ_RELEASE(accelerator_event_htod_array[i]); } @@ -313,7 +312,7 @@ void mca_pml_ob1_accelerator_fini(void) } if (NULL != accelerator_event_dtoh_array) { - for (i = 0; i < accelerator_event_max; i++) { + for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) { if (NULL != accelerator_event_dtoh_array[i]) { OBJ_RELEASE(accelerator_event_dtoh_array[i]); } diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index 4d8980859a8..32c99c26123 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -62,6 +62,7 @@ static int mca_pml_ob1_component_fini(void); int mca_pml_ob1_output = 0; static int mca_pml_ob1_verbose = 0; bool mca_pml_ob1_matching_protection = false; +int mca_pml_ob1_accelerator_events_max = 400; /* Default size (400) of the accelerator event arrays; registered in mca_pml_ob1_component_register() as the MCA variable accelerator_events_max. NOTE(review): no range check is visible in this patch although the value is used as a calloc/malloc element count and loop bound - confirm non-positive values are rejected. 
*/ mca_pml_base_component_2_1_0_t mca_pml_ob1_component = { /* First, the mca_base_component_t struct containing meta @@ -242,6 +243,12 @@ static int mca_pml_ob1_component_register(void) MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, mca_pml_ob1_get_posted_recvq_size, NULL, mca_pml_ob1_comm_size_notify, NULL); + mca_pml_ob1_accelerator_events_max = 400; + (void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "accelerator_events_max", + "Number of events created by the ob1 component internally", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, 
OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_ob1_accelerator_events_max); + return OMPI_SUCCESS; }