DRAFT : Register memory#1010
Conversation
|
Can one of the admins verify this patch? |
|
|
||
| typedef struct node_ucc_ee_executor_task node_ucc_ee_executor_task_t; | ||
| typedef struct node_ucc_ee_executor_task { | ||
| ucc_ee_executor_task_t *val; |
| } \ | ||
| } while(0) | ||
|
|
||
| // #define EXEC_TASK_TEST_2(_errmsg, _etask) do { |
| while (polls++ < task->n_polls) { | ||
| node_ucc_ee_executor_task_t *current_node; | ||
| current_node = task->allgather_kn.etask_linked_list_head; | ||
| while(current_node != NULL) { |
There was a problem hiding this comment.
Ask Sergey should this be put inside / outside of loop
There was a problem hiding this comment.
what are you talking about ? I don't remember
There was a problem hiding this comment.
performance wise, should we iterate the list of etasks inside the while loop, or outside (1 TIME OR N_POLL TIME)
There was a problem hiding this comment.
almost no performance difference according to Sergey
| status = ucc_ee_executor_task_test(current_node->val); \ | ||
| if (status > 0) { \ | ||
| ucc_ee_executor_task_finalize(current_node->val); \ | ||
| ucp_memcpy_device_complete(current_node->val->completion, status); |
There was a problem hiding this comment.
have to come before finalize
| node_ucc_ee_executor_task_t *current_node; | ||
| current_node = task->allgather_kn.etask_linked_list_head; | ||
| while(current_node != NULL) { | ||
| if (current_node->val != NULL) { |
| task->allgather_kn.etask_linked_list_head = (node_ucc_ee_executor_task_t *)malloc(sizeof(node_ucc_ee_executor_task_t)); | ||
| task->allgather_kn.etask_linked_list_head->val = NULL; |
There was a problem hiding this comment.
task->allgather_kn.etask_linked_list_head = NULL
| ucc_tl_ucp_context_t *ctx = UCC_TL_UCP_TEAM_CTX(team); | ||
| ucp_mem_map_params_t mmap_params; | ||
| ucp_mem_h mh; | ||
| int size_of_list = 6; |
| UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; | ||
| mmap_params.memory_type = ucc_memtype_to_ucs[mem_type]; | ||
|
|
||
| // EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", |
|
|
||
| mmap_params.address = task->allgather_kn.sbuf; | ||
| mmap_params.length = local * dt_size; | ||
| UCPCHECK_GOTO(ucs_status_to_ucc_status(ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh)), |
There was a problem hiding this comment.
dst is not important? ask Sergey
ucc_tl_ucp_send_nb(void *buffer, size_t msglen, ucc_memory_type_t mtype,
ucc_rank_t dest_group_rank, ucc_tl_ucp_team_t *team,
ucc_tl_ucp_task_t *task)
There was a problem hiding this comment.
dest isnt important
| mmap_params.address = rbuf; | ||
| mmap_params.length = data_size; | ||
| UCPCHECK_GOTO(ucs_status_to_ucc_status(ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh)), | ||
| task, out); | ||
| if (count_mh == size_of_list){ | ||
| size_of_list *= 2; | ||
| mh_list = (ucp_mem_h *)realloc(mh_list, size_of_list * sizeof(ucp_mem_h)); | ||
| } | ||
| mh_list[count_mh++] = mh; |
There was a problem hiding this comment.
make sure all cases are perfectly identical logic (double list size etc), try to create a macro if possible
There was a problem hiding this comment.
what isnt identical, I'm not sure to understand
| } | ||
| mh_list[count_mh++] = mh; | ||
| } | ||
|
|
There was a problem hiding this comment.
| task->allgather_kn.etask = NULL; | ||
| task->allgather_kn.etask_linked_list_head->etask); | ||
|
|
||
| // task->allgather_kn.etask_linked_list_head = NULL; |
|
|
||
| EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", | ||
| task->allgather_kn.etask); | ||
| task->allgather_kn.etask = NULL; |
There was a problem hiding this comment.
a BUG - you can't remove it. its needed
| team, task), | ||
| team, task, mh_list[count_mh++]), | ||
| task, out); | ||
| if (count_mh >= max_count){ |
There was a problem hiding this comment.
there is a function ucc_assert for that look what its doing
There was a problem hiding this comment.
why u chose to out that check here?
There was a problem hiding this comment.
should I just raise an error maybe ? How ?
There was a problem hiding this comment.
if I use ucc_assert, I can't add my print right ?
| task, out); | ||
| if (count_mh >= max_count){ | ||
| printf("count_mh is bigger than it should (%d >= %d).", count_mh, max_count); | ||
| goto out; |
There was a problem hiding this comment.
you mean even with ucc_assert ?
| team, task), | ||
| team, task, mh_list[count_mh++]), | ||
| task, out); | ||
| if (count_mh >= max_count){ |
| status = ucc_coll_task_get_executor(&task->super, &exec); | ||
| if (ucc_unlikely(status != UCC_OK)) { | ||
| task->super.status = status; | ||
| return -1; |
There was a problem hiding this comment.
maybe u can simply return uuc_status? check its values, it's enum so it's int eventually
|
|
||
| if (ucc_unlikely(status != UCC_OK)) { | ||
| task->super.status = status; | ||
| return -1; |
| task->allgather_kn.etask_linked_list_head = new_node; | ||
|
|
||
| task->allgather_kn.etask_linked_list_head->etask->completion = completion; | ||
| return 1; |
| status = ucc_ee_executor_task_test(etask); | ||
| while (status>0) { | ||
| status = ucc_ee_executor_task_test(etask); | ||
| // if (ucc_unlikely(status < 0)) { |
| #ifndef UCC_SCHEDULE_H_ | ||
| #define UCC_SCHEDULE_H_ | ||
|
|
||
| #include <ucp/api/ucp.h> |
There was a problem hiding this comment.
I think so for the ucp_mem_map
| ucp_mem_h *mh_list; | ||
| int count_mh; |
There was a problem hiding this comment.
Why ? Is it a bug ?
| ucc_sbgp_t *sbgp; | ||
|
|
||
| task = ucc_tl_ucp_init_task(coll_args, team); | ||
| ucc_mpool_init(&task->allgather_kn.etask_node_mpool, 0, sizeof(node_ucc_ee_executor_task_t), |
There was a problem hiding this comment.
u didn't check for status
There was a problem hiding this comment.
what should I do if status<0 ?
| task->subset.myrank = sbgp->group_rank; | ||
| task->subset.map = sbgp->map; | ||
| } | ||
| register_memory(&task->super); |
There was a problem hiding this comment.
u didn't check for status
| #define MEM_MAP() do { \ | ||
| status = ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh); \ | ||
| if (UCC_OK != status) { \ | ||
| task->super.status = status; \ |
There was a problem hiding this comment.
no, just return status and error
ucc/src/components/tl/nccl/tl_nccl_context.c
Line 196 in 2359644
There was a problem hiding this comment.
I should keep this self->super.super.lib or it's something else for here ?
| node_ucc_ee_executor_task_t *etask_linked_list_head; | ||
| ucc_rank_t recv_dist; | ||
| ucc_mpool_t etask_node_mpool; | ||
| ucc_ee_executor_task_t *etask; |
There was a problem hiding this comment.
move back to original place
| if (UCC_INPROGRESS == ucc_tl_ucp_test_with_etasks(task)) { | ||
| SAVE_STATE(UCC_KN_PHASE_PROXY); | ||
| return; | ||
| } |
There was a problem hiding this comment.
add in out
|ucc_assert(count_mh == max_count) or smth similar
| size_t local = GET_LOCAL_COUNT(args, size, rank); | ||
| ucp_mem_h *mh_list = task->mh_list; | ||
| int max_count = task->count_mh; | ||
| int count_mh = 0; |
There was a problem hiding this comment.
BUG - this is a mistake, because u want to keep it when u go in and out from progress
There was a problem hiding this comment.
right but so how do I initialize it ?
|
|
||
| EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", | ||
| task->allgather_kn.etask); | ||
|
|
|
|
||
| UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_kn_start", 0); | ||
| ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); | ||
| task->allgather_kn.etask = NULL; |
|
|
||
| if (KN_NODE_EXTRA == node_type) { | ||
| if (p->type != KN_PATTERN_ALLGATHERX) { | ||
|
|
There was a problem hiding this comment.
no need for all of the line spaces
6142008 to
3ca742c
Compare
What
add callback function to replace cuda copy
Why ?
There is a deadlock when executing collectives in AI workload