-
Notifications
You must be signed in to change notification settings - Fork 103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CL/HIER: Add allgatherv #1050
base: master
Are you sure you want to change the base?
CL/HIER: Add allgatherv #1050
Conversation
9e34573
to
c126e93
Compare
198da51
to
862cad6
Compare
0774618
to
1298163
Compare
Can one of the admins verify this patch? |
@aamirshafi Please review |
584b1b4
to
0305f14
Compare
0305f14
to
30f0ce1
Compare
if (ucc_unlikely(cl_team->node_leaders == NULL)) { | ||
cl_team->node_leaders = ucc_malloc(sizeof(ucc_rank_t) * team_size); | ||
if (!cl_team->node_leaders) { | ||
cl_error(team->context->lib, "Could not allocate node_leaders array"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return error
} | ||
cl_team->leader_list = ucc_malloc(sizeof(ucc_rank_t) * ldr_sbgp_size); | ||
if (!cl_team->node_leaders) { | ||
cl_error(team->context->lib, "Could not allocate leader_list array"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return error
args.args.src.info.mem_type = UCC_MEMORY_TYPE_HOST; | ||
args.args.dst.info_v.displacements = leader_disps; | ||
args.args.dst.info_v.counts = leader_counts; | ||
args.args.dst.info_v.buffer = args_old.args.dst.info_v.buffer; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here the dst.info_v.buffer size might not be enough to hold the data because of gaps.
cl_team->top_sbgp != UCC_HIER_SBGP_NODE) { | ||
args = args_old; | ||
args.args.coll_type = UCC_COLL_TYPE_BCAST; | ||
args.args.root = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change the root to the leader's rank ID.
0, UCC_MSG_MAX, UCC_CL_HIER_DEFAULT_SCORE, | ||
ucc_cl_hier_allgatherv_init, cl_team); | ||
if (UCC_OK != status) { | ||
cl_error(lib, "faild to add range to score_t"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo: "faild" should be "failed".
args->dst.info_v.datatype); | ||
ucc_ee_executor_t *exec; | ||
ucc_status_t status; | ||
int i; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
int -> ucc_rank_t
} | ||
|
||
schedule->super.status = UCC_INPROGRESS; | ||
schedule->super.super.status = UCC_INPROGRESS; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no need to set super.super.status
|
||
out: | ||
schedule->super.status = st; | ||
schedule->super.super.status = st; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't set it; it may lead to races with multiple threads.
ucc_rank_t i; | ||
|
||
for (i = 0; i < *n_tasks; i++) { | ||
ucc_ee_executor_task_t *etask = tasks[i]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Define the variable at the beginning of the function.
ucc_schedule_t *schedule = ucc_derived_of(task, ucc_schedule_t); | ||
ucc_cl_hier_schedule_t *cl_schedule = ucc_derived_of(schedule, | ||
ucc_cl_hier_schedule_t); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ucc_schedule_t *schedule = ucc_derived_of(task, ucc_schedule_t); | |
ucc_cl_hier_schedule_t *cl_schedule = ucc_derived_of(schedule, | |
ucc_cl_hier_schedule_t); | |
ucc_cl_hier_schedule_t *cl_schedule = ucc_derived_of(task, | |
ucc_cl_hier_schedule_t); |
@@ -1,48 +1,55 @@ | |||
# | |||
# Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |||
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2025
return cl_team->node_leaders[team_rank]; | ||
} | ||
|
||
UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Sergei-Lebedev can you elaborate, is this something we simply don't support in allgatherv?
@@ -0,0 +1,15 @@ | |||
/** | |||
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2025
@@ -1,5 +1,5 @@ | |||
/** | |||
* Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |||
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2025
@@ -1,5 +1,5 @@ | |||
/** | |||
* Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |||
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2025
@@ -1,5 +1,5 @@ | |||
/** | |||
* Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |||
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2025
This PR adds CL/HIER allgatherv and updates the allgatherv gtest to test non-contiguous dst buffers. It is meant to be used in conjunction with my TL/SHM gatherv implementation.
The algorithm is:
Comparison Data
2 nodes, 32 PPN on lego-grace cg (1 socket, 72 cores/socket)