Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pml/ob1: make no. of events an mca parameter #12672

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ompi/mca/pml/ob1/pml_ob1.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ typedef struct mca_pml_ob1_t mca_pml_ob1_t;
extern mca_pml_ob1_t mca_pml_ob1;
/* Handle passed to opal_output_verbose() by the ob1 accelerator code when it
 * reports errors such as allocation or event-creation failures. */
extern int mca_pml_ob1_output;
extern bool mca_pml_ob1_matching_protection;
/* Number of entries in each accelerator event array (dtoh and htod) and in
 * the matching frag arrays; the circular-buffer indices wrap back to 0 when
 * they reach this value. Defined in pml_ob1_component.c with a default of 400
 * and registered there as the MCA variable "accelerator_events_max"
 * (the out-of-events message refers to it as pml_ob1_accelerator_events_max).
 * NOTE(review): the visible call sites use this value directly as an
 * allocation count and loop bound; no rejection of non-positive values is
 * visible here -- confirm whether validation exists elsewhere. */
extern int mca_pml_ob1_accelerator_events_max;

/*
* PML interface functions.
*/
Expand Down
27 changes: 13 additions & 14 deletions ompi/mca/pml/ob1/pml_ob1_accelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ static int accelerator_event_dtoh_first_used, accelerator_event_htod_first_used;
static volatile int accelerator_event_dtoh_num_used, accelerator_event_htod_num_used;

/* Size of array holding events */
static int accelerator_event_max = 400;
static int accelerator_event_htod_most = 0;

int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
Expand All @@ -87,9 +86,9 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f
* return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */
OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock);
if (accelerator_event_htod_num_used == accelerator_event_max) {
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca mpi_common_accelerator_event_max %d.",
accelerator_event_max, accelerator_event_max + 100);
if (accelerator_event_htod_num_used == mca_pml_ob1_accelerator_events_max) {
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca pml_ob1_accelerator_events_max %d.",
mca_pml_ob1_accelerator_events_max, mca_pml_ob1_accelerator_events_max + 100);
OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
}
Expand All @@ -113,7 +112,7 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f

/* Bump up the first available slot and number used by 1 */
accelerator_event_htod_first_avail++;
if (accelerator_event_htod_first_avail >= accelerator_event_max) {
if (accelerator_event_htod_first_avail >= mca_pml_ob1_accelerator_events_max) {
accelerator_event_htod_first_avail = 0;
}
accelerator_event_htod_num_used++;
Expand Down Expand Up @@ -169,7 +168,7 @@ int mca_pml_ob1_progress_one_htod_event(struct mca_btl_base_descriptor_t **frag)
/* Bump counters, loop around the circular buffer if necessary */
--accelerator_event_htod_num_used;
++accelerator_event_htod_first_used;
if (accelerator_event_htod_first_used >= accelerator_event_max) {
if (accelerator_event_htod_first_used >= mca_pml_ob1_accelerator_events_max) {
accelerator_event_htod_first_used = 0;
}
/* A return value of 1 indicates an event completed and a frag was returned */
Expand Down Expand Up @@ -214,15 +213,15 @@ int mca_pml_ob1_accelerator_init(void)
accelerator_event_dtoh_first_avail = 0;
accelerator_event_dtoh_first_used = 0;

accelerator_event_dtoh_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_dtoh_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_dtoh_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_dtoh_array[i]);
if (OPAL_SUCCESS != result) {
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
Expand All @@ -234,7 +233,7 @@ int mca_pml_ob1_accelerator_init(void)
/* The first available status index is 0. Make an empty frag
array. */
accelerator_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
if (NULL == accelerator_event_dtoh_frag_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
Expand All @@ -247,15 +246,15 @@ int mca_pml_ob1_accelerator_init(void)
accelerator_event_htod_first_avail = 0;
accelerator_event_htod_first_used = 0;

accelerator_event_htod_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_htod_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_htod_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_htod_array[i]);
if (OPAL_SUCCESS != result) {
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
Expand All @@ -267,7 +266,7 @@ int mca_pml_ob1_accelerator_init(void)
/* The first available status index is 0. Make an empty frag
array. */
accelerator_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
if (NULL == accelerator_event_htod_frag_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
Expand Down Expand Up @@ -304,7 +303,7 @@ void mca_pml_ob1_accelerator_fini(void)
}

if (NULL != accelerator_event_htod_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
if (NULL != accelerator_event_htod_array[i]) {
OBJ_RELEASE(accelerator_event_htod_array[i]);
}
Expand All @@ -313,7 +312,7 @@ void mca_pml_ob1_accelerator_fini(void)
}

if (NULL != accelerator_event_dtoh_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
if (NULL != accelerator_event_dtoh_array[i]) {
OBJ_RELEASE(accelerator_event_dtoh_array[i]);
}
Expand Down
7 changes: 7 additions & 0 deletions ompi/mca/pml/ob1/pml_ob1_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ static int mca_pml_ob1_component_fini(void);
/* Handle passed to opal_output_verbose() by ob1 code; starts out as 0. */
int mca_pml_ob1_output = 0;
static int mca_pml_ob1_verbose = 0;
bool mca_pml_ob1_matching_protection = false;
/* Size of the accelerator event arrays allocated by
 * mca_pml_ob1_accelerator_init(). The default of 400 is assigned here and
 * again in mca_pml_ob1_component_register() immediately before the variable
 * is registered as the read-only-scope MCA integer "accelerator_events_max". */
int mca_pml_ob1_accelerator_events_max = 400;

mca_pml_base_component_2_1_0_t mca_pml_ob1_component = {
/* First, the mca_base_component_t struct containing meta
Expand Down Expand Up @@ -242,6 +243,12 @@ static int mca_pml_ob1_component_register(void)
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_pml_ob1_get_posted_recvq_size, NULL, mca_pml_ob1_comm_size_notify, NULL);

mca_pml_ob1_accelerator_events_max = 400;
(void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "accelerator_events_max",
"Number of events created by the ob1 component internally",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_ob1_accelerator_events_max);

return OMPI_SUCCESS;
}

Expand Down
Loading