From de71f5a3cc71294d9844836b05e25ab9f915f6da Mon Sep 17 00:00:00 2001
From: Luke Robison
Date: Fri, 18 Apr 2025 13:25:25 +0000
Subject: [PATCH 1/2] coll/han: disable alltoall for device buffers and MPI_IN_PLACE

The han alltoall algorithm was never intended for an in-place alltoall
and will crash if used in that configuration. Additionally it is not an
efficient choice for device buffers. Fall back to another algorithm if
either condition is met.

Signed-off-by: Luke Robison
---
 ompi/mca/coll/han/coll_han_alltoall.c | 65 ++++++++++++++++++---------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/ompi/mca/coll/han/coll_han_alltoall.c b/ompi/mca/coll/han/coll_han_alltoall.c
index 6437ea7a6af..e7d24e433f4 100644
--- a/ompi/mca/coll/han/coll_han_alltoall.c
+++ b/ompi/mca/coll/han/coll_han_alltoall.c
@@ -69,6 +69,10 @@ int mca_coll_han_alltoall_using_smsc(
 {
     mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
 
+    opal_convertor_t convertor;
+    int send_needs_bounce, have_device_buffer;
+    size_t packed_size = 0;
+
 
     OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
                          "Entering mca_coll_han_alltoall_using_smsc\n"));
@@ -82,6 +86,44 @@
                                              comm, han_module->previous_alltoall_module);
     }
 
+    if (sbuf == MPI_IN_PLACE) {
+        /* This is not an in-place algorithm */
+        return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
+                                             comm, han_module->previous_alltoall_module);
+    }
+
+    OBJ_CONSTRUCT( &convertor, opal_convertor_t );
+    send_needs_bounce = 0;
+    have_device_buffer = 0;
+    /* get converter for copying to one of the leader ranks, and get packed size: */
+    opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
+    have_device_buffer |= opal_convertor_on_device(&convertor);
+    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
+    opal_convertor_cleanup(&convertor);
+
+    opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
+    have_device_buffer |= opal_convertor_on_device(&convertor);
+    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
+    opal_convertor_get_packed_size( &convertor, &packed_size );
+    opal_convertor_cleanup(&convertor);
+
+    if (have_device_buffer) {
+        /*
+           Although this algorithm is functional for device buffers, it requires an
+           extra copy through the bounce buffer that doesn't make it efficient.
+           Prefer another algorithm instead.
+
+           Note that Open MPI makes assumptions that if one rank uses a device
+           buffer in a collective, then all ranks will use device buffers, so there
+           is no need to communicate before taking this branch.
+        */
+        OBJ_DESTRUCT(&convertor);
+        return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
+                                             comm, han_module->previous_alltoall_module);
+    }
+
+
+
     /* Create the subcommunicators */
     if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
         opal_output_verbose(1, mca_coll_han_component.han_output,
@@ -107,12 +149,11 @@
                                              comm, han_module->previous_alltoall_module);
     }
 
-    int rc, send_needs_bounce, ii_push_data;
+    int rc, ii_push_data;
     size_t sndsize;
     MPI_Aint sextent, rextent, lb;
-    char *send_bounce;
-    opal_convertor_t convertor;
-    size_t packed_size = 0, packed_size_tmp;
+    char *send_bounce = NULL;
+    size_t packed_size_tmp;
     int use_isend;
     void *gather_buf_in[4];
     int up_rank;
@@ -140,22 +181,6 @@
     }
     if (fanout > up_size) { fanout = up_size; }
 
-    OBJ_CONSTRUCT( &convertor, opal_convertor_t );
-
-
-    send_needs_bounce = 0;
-    /* get converter for copying to one of the leader ranks, and get packed size: */
-    opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
-    send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
-    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
-    opal_convertor_cleanup(&convertor);
-
-    opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
-    send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
-    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
-    opal_convertor_get_packed_size( &convertor, &packed_size );
-    opal_convertor_cleanup(&convertor);
-
     /*
        Because push-mode needs extra synchronizations, we'd like to avoid it,
        however it might be necessary:

From 7b7269d1d6148243c4687f2c2ad80974e0955f00 Mon Sep 17 00:00:00 2001
From: Luke Robison
Date: Fri, 18 Apr 2025 14:27:35 +0000
Subject: [PATCH 2/2] coll/han: alltoall uses han freelist when possible

As an optimization, when the han alltoall algorithm is forced to
allocate a bounce buffer, and that bounce buffer is small enough, then
use a buffer from the han_component.pack_buffers free list.

Signed-off-by: Luke Robison
---
 ompi/mca/coll/han/coll_han_alltoall.c | 33 ++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/ompi/mca/coll/han/coll_han_alltoall.c b/ompi/mca/coll/han/coll_han_alltoall.c
index e7d24e433f4..14b8cad51bf 100644
--- a/ompi/mca/coll/han/coll_han_alltoall.c
+++ b/ompi/mca/coll/han/coll_han_alltoall.c
@@ -72,6 +72,12 @@ int mca_coll_han_alltoall_using_smsc(
     opal_convertor_t convertor;
     int send_needs_bounce, have_device_buffer;
     size_t packed_size = 0;
+    enum {
+        BOUNCE_NOT_INITIALIZED = 0,
+        BOUNCE_IS_FROM_RBUF = 1,
+        BOUNCE_IS_FROM_FREELIST = 2,
+        BOUNCE_IS_FROM_MALLOC = 3,
+    };
 
 
     OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
@@ -191,7 +197,7 @@
        If the application buffer is device memory, we'll also need to exchange
        in push mode so that the process which has device registrations can
-       perform the reads.
+       perform the reads. (this mode has been disabled)
 
        In both of these cases, we'll need to use the bounce buffer too.
 
     */
@@ -211,19 +217,30 @@
     inter_recv_reqs = malloc(sizeof(*inter_recv_reqs) * up_size );
     char **low_bufs = malloc(low_size * sizeof(*low_bufs));
     void **sbuf_map_ctx = malloc(low_size * sizeof(&sbuf_map_ctx));
+    opal_free_list_item_t *send_fl_item = NULL;
     const int nptrs_gather = 3;
     void **gather_buf_out = calloc(low_size*nptrs_gather, sizeof(void*));
-    bool send_bounce_is_allocated = false;
+    int send_bounce_status = BOUNCE_NOT_INITIALIZED;
 
     do {
 start_allgather:
 
         if ( 0 == send_needs_bounce ) {
             send_bounce = (char*)rbuf + up_rank*send_bytes_per_fan;
+            send_bounce_status = BOUNCE_IS_FROM_RBUF;
         } else {
-            if (!send_bounce_is_allocated) {
-                send_bounce = malloc(send_bytes_per_fan * fanout);
-                send_bounce_is_allocated = true;
+            if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF) {
+                if (send_bytes_per_fan * fanout < mca_coll_han_component.han_packbuf_bytes) {
+                    send_fl_item = opal_free_list_get(&mca_coll_han_component.pack_buffers);
+                    if (send_fl_item) {
+                        send_bounce_status = BOUNCE_IS_FROM_FREELIST;
+                        send_bounce = send_fl_item->ptr;
+                    }
+                }
+                if (!send_fl_item) {
+                    send_bounce = malloc(send_bytes_per_fan * fanout);
+                    send_bounce_status = BOUNCE_IS_FROM_MALLOC;
+                }
             }
         }
 
@@ -409,7 +426,11 @@
         }
     }
     OBJ_DESTRUCT(&convertor);
-    if (send_bounce_is_allocated) free(send_bounce);
+    if (send_bounce_status == BOUNCE_IS_FROM_FREELIST) {
+        opal_free_list_return(&mca_coll_han_component.pack_buffers, send_fl_item);
+    } else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC) {
+        free(send_bounce);
+    }
     free(inter_send_reqs);
     free(inter_recv_reqs);
     free(sbuf_map_ctx);
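
Reviewer note (not part of the patches above): the following is a minimal, self-contained sketch of the
bounce-buffer selection strategy that PATCH 2/2 introduces, shown outside the diff for clarity. The free
list is mocked with a single static slot, and the names PACKBUF_BYTES, mock_freelist_get(),
mock_freelist_return(), pick_bounce_buffer() and release_bounce_buffer() are illustrative stand-ins only;
the real code calls opal_free_list_get()/opal_free_list_return() on mca_coll_han_component.pack_buffers
and compares the request against mca_coll_han_component.han_packbuf_bytes.

/* bounce_buffer_sketch.c
 *
 * Standalone model of the bounce-buffer selection in PATCH 2/2.
 * The free list here is a trivial mock with one preallocated slot;
 * the rbuf-reuse path of the real algorithm is not modeled.
 */
#include <stdio.h>
#include <stdlib.h>

#define PACKBUF_BYTES (128 * 1024)   /* stand-in for han_packbuf_bytes */

enum bounce_source {
    BOUNCE_NOT_INITIALIZED = 0,
    BOUNCE_IS_FROM_FREELIST,
    BOUNCE_IS_FROM_MALLOC,
};

/* Mock free list: a single preallocated slot of PACKBUF_BYTES. */
static char mock_slot[PACKBUF_BYTES];
static int  mock_slot_in_use = 0;

static void *mock_freelist_get(void)
{
    if (mock_slot_in_use) {
        return NULL;                 /* list exhausted: caller falls back to malloc */
    }
    mock_slot_in_use = 1;
    return mock_slot;
}

static void mock_freelist_return(void *ptr)
{
    if (ptr == mock_slot) {
        mock_slot_in_use = 0;
    }
}

/* Pick a bounce buffer of at least `bytes`, preferring the free list
 * when the request fits in one of its fixed-size slots. */
static char *pick_bounce_buffer(size_t bytes, enum bounce_source *src)
{
    char *buf = NULL;

    if (bytes < PACKBUF_BYTES) {
        buf = mock_freelist_get();
        if (buf) {
            *src = BOUNCE_IS_FROM_FREELIST;
            return buf;
        }
    }
    /* Too large for a pack buffer, or the list was empty: heap-allocate. */
    buf = malloc(bytes);
    *src = (buf != NULL) ? BOUNCE_IS_FROM_MALLOC : BOUNCE_NOT_INITIALIZED;
    return buf;
}

/* Release according to where the buffer came from, mirroring the
 * cleanup branch added at the end of the collective. */
static void release_bounce_buffer(char *buf, enum bounce_source src)
{
    if (BOUNCE_IS_FROM_FREELIST == src) {
        mock_freelist_return(buf);
    } else if (BOUNCE_IS_FROM_MALLOC == src) {
        free(buf);
    }
}

int main(void)
{
    enum bounce_source src = BOUNCE_NOT_INITIALIZED;

    /* Small request: expected to come from the (mock) free list. */
    char *small = pick_bounce_buffer(4096, &src);
    printf("small buffer source: %s\n",
           src == BOUNCE_IS_FROM_FREELIST ? "freelist" : "malloc");
    release_bounce_buffer(small, src);

    /* Large request: exceeds PACKBUF_BYTES, so it must be malloc'd. */
    char *large = pick_bounce_buffer((size_t)PACKBUF_BYTES * 4, &src);
    printf("large buffer source: %s\n",
           src == BOUNCE_IS_FROM_FREELIST ? "freelist" : "malloc");
    release_bounce_buffer(large, src);

    return 0;
}

The point of the pattern is that small bounce buffers are recycled from a preallocated pool while
oversized or pool-exhausted requests fall back to malloc, and the recorded source lets the cleanup
path return or free the buffer through the matching mechanism.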