Skip to content

Commit

Permalink
Merge pull request #12672 from edgargabriel/pr/ob1-max-events-param-v5.0.x
Browse files Browse the repository at this point in the history

pml/ob1: make no. of events an mca parameter
  • Loading branch information
wenduwan authored Jul 11, 2024
2 parents 8e848da + 2d1a2fa commit 7c7c995
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 14 deletions.
2 changes: 2 additions & 0 deletions ompi/mca/pml/ob1/pml_ob1.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ typedef struct mca_pml_ob1_t mca_pml_ob1_t;
extern mca_pml_ob1_t mca_pml_ob1;
/* Passed to opal_output_verbose() for the ob1 diagnostic messages. */
extern int mca_pml_ob1_output;
extern bool mca_pml_ob1_matching_protection;
/* Number of entries in each accelerator event array (dtoh and htod) and in
 * the matching frag arrays.  Defined in pml_ob1_component.c with a default
 * of 400 and registered there as the MCA variable "accelerator_events_max"
 * (the out-of-events message names it pml_ob1_accelerator_events_max). */
extern int mca_pml_ob1_accelerator_events_max;

/*
* PML interface functions.
*/
Expand Down
27 changes: 13 additions & 14 deletions ompi/mca/pml/ob1/pml_ob1_accelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ static int accelerator_event_dtoh_first_used, accelerator_event_htod_first_used;
static volatile int accelerator_event_dtoh_num_used, accelerator_event_htod_num_used;

/* Size of array holding events */
static int accelerator_event_max = 400;
static int accelerator_event_htod_most = 0;

int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
Expand All @@ -87,9 +86,9 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f
* return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */
OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock);
if (accelerator_event_htod_num_used == accelerator_event_max) {
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca mpi_common_accelerator_event_max %d.",
accelerator_event_max, accelerator_event_max + 100);
if (accelerator_event_htod_num_used == mca_pml_ob1_accelerator_events_max) {
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca pml_ob1_accelerator_events_max %d.",
mca_pml_ob1_accelerator_events_max, mca_pml_ob1_accelerator_events_max + 100);
OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
}
Expand All @@ -113,7 +112,7 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f

/* Bump up the first available slot and number used by 1 */
accelerator_event_htod_first_avail++;
if (accelerator_event_htod_first_avail >= accelerator_event_max) {
if (accelerator_event_htod_first_avail >= mca_pml_ob1_accelerator_events_max) {
accelerator_event_htod_first_avail = 0;
}
accelerator_event_htod_num_used++;
Expand Down Expand Up @@ -169,7 +168,7 @@ int mca_pml_ob1_progress_one_htod_event(struct mca_btl_base_descriptor_t **frag)
/* Bump counters, loop around the circular buffer if necessary */
--accelerator_event_htod_num_used;
++accelerator_event_htod_first_used;
if (accelerator_event_htod_first_used >= accelerator_event_max) {
if (accelerator_event_htod_first_used >= mca_pml_ob1_accelerator_events_max) {
accelerator_event_htod_first_used = 0;
}
/* A return value of 1 indicates an event completed and a frag was returned */
Expand Down Expand Up @@ -214,15 +213,15 @@ int mca_pml_ob1_accelerator_init(void)
accelerator_event_dtoh_first_avail = 0;
accelerator_event_dtoh_first_used = 0;

accelerator_event_dtoh_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_dtoh_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_dtoh_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_dtoh_array[i]);
if (OPAL_SUCCESS != result) {
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
Expand All @@ -234,7 +233,7 @@ int mca_pml_ob1_accelerator_init(void)
/* The first available status index is 0. Make an empty frag
array. */
accelerator_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
if (NULL == accelerator_event_dtoh_frag_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
Expand All @@ -247,15 +246,15 @@ int mca_pml_ob1_accelerator_init(void)
accelerator_event_htod_first_avail = 0;
accelerator_event_htod_first_used = 0;

accelerator_event_htod_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_htod_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_htod_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_htod_array[i]);
if (OPAL_SUCCESS != result) {
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
Expand All @@ -267,7 +266,7 @@ int mca_pml_ob1_accelerator_init(void)
/* The first available status index is 0. Make an empty frag
array. */
accelerator_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
if (NULL == accelerator_event_htod_frag_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
Expand Down Expand Up @@ -304,7 +303,7 @@ void mca_pml_ob1_accelerator_fini(void)
}

if (NULL != accelerator_event_htod_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
if (NULL != accelerator_event_htod_array[i]) {
OBJ_RELEASE(accelerator_event_htod_array[i]);
}
Expand All @@ -313,7 +312,7 @@ void mca_pml_ob1_accelerator_fini(void)
}

if (NULL != accelerator_event_dtoh_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
if (NULL != accelerator_event_dtoh_array[i]) {
OBJ_RELEASE(accelerator_event_dtoh_array[i]);
}
Expand Down
7 changes: 7 additions & 0 deletions ompi/mca/pml/ob1/pml_ob1_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ static int mca_pml_ob1_component_fini(void);
/* Passed to opal_output_verbose() for the ob1 diagnostic messages. */
int mca_pml_ob1_output = 0;
static int mca_pml_ob1_verbose = 0;
bool mca_pml_ob1_matching_protection = false;
/* Capacity of the accelerator event arrays allocated in
 * mca_pml_ob1_accelerator_init().  The default of 400 is assigned again in
 * mca_pml_ob1_component_register() right before the variable is registered
 * as the MCA parameter "accelerator_events_max". */
int mca_pml_ob1_accelerator_events_max = 400;

mca_pml_base_component_2_1_0_t mca_pml_ob1_component = {
/* First, the mca_base_component_t struct containing meta
Expand Down Expand Up @@ -242,6 +243,12 @@ static int mca_pml_ob1_component_register(void)
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_pml_ob1_get_posted_recvq_size, NULL, mca_pml_ob1_comm_size_notify, NULL);

mca_pml_ob1_accelerator_events_max = 400;
(void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "accelerator_events_max",
"Number of events created by the ob1 component internally",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_ob1_accelerator_events_max);

return OMPI_SUCCESS;
}

Expand Down

0 comments on commit 7c7c995

Please sign in to comment.