From e8f997bf8bab657bf1fb1af59a7c1602457b48bd Mon Sep 17 00:00:00 2001 From: mark-stock Date: Thu, 1 Feb 2024 13:38:43 -0600 Subject: [PATCH 1/2] Frontier builds with hip and rccl --- scripts/Makefile_frontier | 4 ++-- test_coll/main.cpp | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/Makefile_frontier b/scripts/Makefile_frontier index 381a722..96f70ab 100644 --- a/scripts/Makefile_frontier +++ b/scripts/Makefile_frontier @@ -1,10 +1,10 @@ # ----- Make Macros ----- CXX = CC -CXXFLAGS = -std=c++14 -fopenmp -I${ROCM_PATH}/include -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 -x hip +CXXFLAGS = -DPORT_HIP -std=c++14 -fopenmp -I${ROCM_PATH}/include/rccl -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 -x hip OPTFLAGS = -O3 -LD_FLAGS = -fopenmp -L${ROCM_PATH}/lib -lamdhip64 -lrccl +LD_FLAGS = -fopenmp -L${ROCM_PATH}/lib -lamdhip64 -lrccl ${PE_MPICH_GTL_DIR_amd_gfx90a} ${PE_MPICH_GTL_LIBS_amd_gfx90a} TARGETS = CommBench OBJECTS = main.o diff --git a/test_coll/main.cpp b/test_coll/main.cpp index 10f266c..082525b 100644 --- a/test_coll/main.cpp +++ b/test_coll/main.cpp @@ -23,18 +23,23 @@ #define ROOT 0 // HEADERS - #include <nccl.h> -// #include <rccl.h> -// #include <sycl.hpp> -// PORTS - #define PORT_CUDA +// PORTS - define here or in Makefile +// #define PORT_CUDA // #define PORT_HIP // #define PORT_SYCL // CONTROL NCCL CAPABILITY -#if defined(PORT_CUDA) || defined(PORT_HIP) +#if defined(PORT_CUDA) #define CAP_NCCL +#include <nccl.h> +#endif +#if defined(PORT_HIP) +#define CAP_NCCL +#include <rccl.h> +#endif +#if defined(PORT_SYCL) +#include <sycl.hpp> #endif // UTILITIES From 5c2065be53e0b2de00c3c21145d41e5c909c5a2d Mon Sep 17 00:00:00 2001 From: mark-stock Date: Thu, 1 Feb 2024 13:44:07 -0600 Subject: [PATCH 2/2] message sizes consistent, corrected BW calcs --- test_coll/main.cpp | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/test_coll/main.cpp b/test_coll/main.cpp index 082525b..d3260c2 100644 --- a/test_coll/main.cpp +++ 
b/test_coll/main.cpp @@ -163,23 +163,24 @@ int main(int argc, char *argv[]) switch(pattern) { case gather : MPI_Gather(sendbuf_d, count, MPI_FLOAT, recvbuf_d, count, MPI_FLOAT, ROOT, MPI_COMM_WORLD); break; case scatter : MPI_Scatter(sendbuf_d, count, MPI_FLOAT, recvbuf_d, count, MPI_FLOAT, ROOT, MPI_COMM_WORLD); break; - case broadcast : MPI_Bcast(sendbuf_d, count * numproc, MPI_FLOAT, ROOT, MPI_COMM_WORLD); break; - case reduce : MPI_Reduce(sendbuf_d, recvbuf_d, count * numproc, MPI_FLOAT, MPI_SUM, ROOT, MPI_COMM_WORLD); break; + case broadcast : MPI_Bcast(sendbuf_d, count, MPI_FLOAT, ROOT, MPI_COMM_WORLD); break; + case reduce : MPI_Reduce(sendbuf_d, recvbuf_d, count, MPI_FLOAT, MPI_SUM, ROOT, MPI_COMM_WORLD); break; case alltoall : MPI_Alltoall(sendbuf_d, count, MPI_FLOAT, recvbuf_d, count, MPI_FLOAT, MPI_COMM_WORLD); break; case allgather : MPI_Allgather(sendbuf_d, count, MPI_FLOAT, recvbuf_d, count, MPI_FLOAT, MPI_COMM_WORLD); break; case reducescatter : MPI_Reduce_scatter(sendbuf_d, recvbuf_d, recvcounts, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); break; - case allreduce : MPI_Allreduce(sendbuf_d, recvbuf_d, count * numproc, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); break; + case allreduce : MPI_Allreduce(sendbuf_d, recvbuf_d, count, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); break; default : return 0; } break; #ifdef CAP_NCCL + // note reducescatter takes scalar for count, while MPI takes array of counts case test::NCCL : switch(pattern) { - case broadcast : ncclBcast(sendbuf_d, count * numproc, ncclFloat32, ROOT, comm_nccl, 0); break; - case reduce : ncclReduce(sendbuf_d, recvbuf_d, count * numproc, ncclFloat32, ncclSum, ROOT, comm_nccl, 0); break; + case broadcast : ncclBcast(sendbuf_d, count, ncclFloat32, ROOT, comm_nccl, 0); break; + case reduce : ncclReduce(sendbuf_d, recvbuf_d, count, ncclFloat32, ncclSum, ROOT, comm_nccl, 0); break; case allgather : ncclAllGather(sendbuf_d, recvbuf_d, count, ncclFloat32, comm_nccl, 0); break; case reducescatter : 
ncclReduceScatter(sendbuf_d, recvbuf_d, count, ncclFloat32, ncclSum, comm_nccl, 0); break; - case allreduce : ncclAllReduce(sendbuf_d, recvbuf_d, count * numproc, ncclFloat32, ncclSum, comm_nccl, 0); break; + case allreduce : ncclAllReduce(sendbuf_d, recvbuf_d, count, ncclFloat32, ncclSum, comm_nccl, 0); break; default : return 0; } #ifdef PORT_CUDA @@ -227,27 +228,27 @@ int main(int argc, char *argv[]) for(int iter = 0; iter < numiter; iter++) avgTime += times[iter]; avgTime /= numiter; - size_t data = count * sizeof(float) * numproc; + size_t data = count * sizeof(float); switch(library) { case test::MPI : switch(pattern) { - case gather : printf("MPI_Gather\n"); break; - case scatter : printf("MPI_Scatter\n"); break; - case broadcast : printf("MPI_Bcast\n"); break; - case reduce : printf("MPI_Reduce\n"); break; - case alltoall : printf("MPI_Alltoall\n"); break; - case allgather : printf("MPI_Allgather\n"); break; - case reducescatter : printf("MPI_Reduce_scatter\n"); break; - case allreduce : printf("MPI_Allreduce\n"); break; + case gather : printf("MPI_Gather\n"); data *= (numproc-1); break; + case scatter : printf("MPI_Scatter\n"); data *= (numproc-1); break; + case broadcast : printf("MPI_Bcast\n"); data *= (numproc-1); break; + case reduce : printf("MPI_Reduce\n"); data *= (numproc-1); break; + case alltoall : printf("MPI_Alltoall\n"); data *= numproc*(numproc-1); break; + case allgather : printf("MPI_Allgather\n"); data *= numproc*(numproc-1); break; + case reducescatter : printf("MPI_Reduce_scatter\n"); data *= (numproc+1)*(numproc-1); break; + case allreduce : printf("MPI_Allreduce\n"); data *= 2*(numproc-1); break; } break; #ifdef CAP_NCCL case test::NCCL : switch(pattern) { - case broadcast : printf("ncclBcast\n"); break; - case reduce : printf("ncclReduce\n"); break; - case allgather : printf("ncclAllGather\n"); break; - case reducescatter : printf("ncclReduceScatter\n"); break; - case allreduce : printf("ncclAllReduce\n"); break; + case broadcast : 
printf("ncclBcast\n"); data *= (numproc-1); break; + case reduce : printf("ncclReduce\n"); data *= (numproc-1); break; + case allgather : printf("ncclAllGather\n"); data *= numproc*(numproc-1); break; + case reducescatter : printf("ncclReduceScatter\n"); data *= (numproc+1)*(numproc-1); break; + case allreduce : printf("ncclAllReduce\n"); data *= 2*(numproc-1); break; } break; #endif }