Skip to content

Commit 3ab3371

Browse files
committed
Add CUDA/HIP implementations of reduction operators
The operators are generated from macros. Function pointers to the kernel launch functions are stored inside the ompi_op_t, as a pointer to a struct that is filled in if accelerator support is available. The ompi_op* API is extended to include versions taking streams and device IDs, to allow enqueuing operators on streams. The old functions map to the stream versions with a NULL stream.

Signed-off-by: Joseph Schuchart <[email protected]>
1 parent 55c0bda commit 3ab3371

24 files changed

+8708
-48
lines changed

config/opal_check_cudart.m4

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
dnl -*- autoconf -*-
dnl
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
dnl                         University Research and Technology
dnl                         Corporation.  All rights reserved.
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
dnl                         of Tennessee Research Foundation.  All rights
dnl                         reserved.
dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
dnl                         University of Stuttgart.  All rights reserved.
dnl Copyright (c) 2004-2005 The Regents of the University of California.
dnl                         All rights reserved.
dnl Copyright (c) 2006-2016 Cisco Systems, Inc.  All rights reserved.
dnl Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
dnl Copyright (c) 2009      IBM Corporation.  All rights reserved.
dnl Copyright (c) 2009      Los Alamos National Security, LLC.  All rights
dnl                         reserved.
dnl Copyright (c) 2009-2011 Oak Ridge National Labs.  All rights reserved.
dnl Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
dnl Copyright (c) 2015      Research Organization for Information Science
dnl                         and Technology (RIST). All rights reserved.
dnl Copyright (c) 2022      Amazon.com, Inc. or its affiliates.  All Rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl


# OPAL_CHECK_CUDART(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether CUDA runtime library support can be found.  Sets
# prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed (via OAC_CHECK_PACKAGE) and
# runs action-if-found if there is support, otherwise executes
# action-if-not-found.  Also exports the CUDART_SUPPORT shell variable
# (0/1), the OPAL_cudart_support automake conditional, and the
# OPAL_CUDART_SUPPORT preprocessor define.

#
# Check for CUDA runtime support
#
AC_DEFUN([OPAL_CHECK_CUDART],[
    OPAL_VAR_SCOPE_PUSH([cudart_save_CPPFLAGS cudart_save_LDFLAGS cudart_save_LIBS])

    # Save compiler/linker state; restored at the bottom so the probe
    # does not leak flags into the rest of configure.
    cudart_save_CPPFLAGS="$CPPFLAGS"
    cudart_save_LDFLAGS="$LDFLAGS"
    cudart_save_LIBS="$LIBS"

    #
    # Check to see if the user provided paths for CUDART
    #
    AC_ARG_WITH([cudart],
                [AS_HELP_STRING([--with-cudart=DIR],
                [Path to the CUDA runtime library and header files])])
    AC_MSG_CHECKING([if --with-cudart is set])
    AC_ARG_WITH([cudart-libdir],
                [AS_HELP_STRING([--with-cudart-libdir=DIR],
                [Search for CUDA runtime libraries in DIR])])

    ####################################
    #### Check for CUDA runtime library
    ####################################
    # Fixed: the original condition was "x$with_cudart" != "xno", which
    # sent any user-supplied --with-cudart=DIR down the "not set" branch
    # and silently ignored the directory.  Only "no" or empty mean "not
    # set" here.  Also added the commas the original omitted between
    # AS_IF arguments, which concatenated the warning branch with the
    # nested directory check.
    AS_IF([test "x$with_cudart" = "xno" || test "x$with_cudart" = "x"],
          [opal_check_cudart_happy=no
           AC_MSG_RESULT([not set (--with-cudart=$with_cudart)])],
          [AS_IF([test ! -d "$with_cudart"],
                 [AC_MSG_RESULT([not found])
                  AC_MSG_WARN([Directory $with_cudart not found])],
                 [AS_IF([test "x`ls $with_cudart/include/cuda_runtime.h 2> /dev/null`" = "x"],
                        [AC_MSG_RESULT([not found])
                         AC_MSG_WARN([Could not find cuda_runtime.h in $with_cudart/include])],
                        [opal_check_cudart_happy=yes
                         opal_cudart_incdir="$with_cudart/include"])])])

    # Fallback: if no usable path yet (and support was not explicitly
    # disabled), derive the toolkit root from the location of nvcc.
    AS_IF([test "$opal_check_cudart_happy" = "no" && test "$with_cudart" != "no"],
          [AC_PATH_PROG([nvcc_bin], [nvcc], ["not-found"])
           AS_IF([test "$nvcc_bin" = "not-found"],
                 [AC_MSG_WARN([Could not find nvcc binary])],
                 [nvcc_dirname=`AS_DIRNAME([$nvcc_bin])`
                  with_cudart=$nvcc_dirname/../
                  opal_cudart_incdir=$nvcc_dirname/../include
                  opal_check_cudart_happy=yes])
          ])

    # Default the library directory to DIR/lib64 unless the user gave one.
    AS_IF([test x"$with_cudart_libdir" = "x"],
          [with_cudart_libdir=$with_cudart/lib64/])

    # Verify we can actually compile and link against cudart.
    AS_IF([test "$opal_check_cudart_happy" = "yes"],
          [OAC_CHECK_PACKAGE([cudart],
                             [$1],
                             [cuda_runtime.h],
                             [cudart],
                             [cudaMalloc],
                             [opal_check_cudart_happy="yes"],
                             [opal_check_cudart_happy="no"])],
          [])

    AC_MSG_CHECKING([if have cuda runtime library support])
    if test "$opal_check_cudart_happy" = "yes"; then
        AC_MSG_RESULT([yes (-I$opal_cudart_incdir)])
        CUDART_SUPPORT=1
        common_cudart_CPPFLAGS="-I$opal_cudart_incdir"
        AC_SUBST([common_cudart_CPPFLAGS])
    else
        AC_MSG_RESULT([no])
        CUDART_SUPPORT=0
    fi

    OPAL_SUMMARY_ADD([Accelerators], [CUDART support], [], [$opal_check_cudart_happy])
    AM_CONDITIONAL([OPAL_cudart_support], [test "x$CUDART_SUPPORT" = "x1"])
    AC_DEFINE_UNQUOTED([OPAL_CUDART_SUPPORT],$CUDART_SUPPORT,
                       [Whether we have cuda runtime library support])

    # Restore the caller's compiler/linker state.
    CPPFLAGS=${cudart_save_CPPFLAGS}
    LDFLAGS=${cudart_save_LDFLAGS}
    LIBS=${cudart_save_LIBS}
    OPAL_VAR_SCOPE_POP
])dnl

ompi/mca/op/base/op_base_frame.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2005 The University of Tennessee and The University
5+
* Copyright (c) 2004-2023 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -42,6 +42,7 @@ static void module_constructor(ompi_op_base_module_t *m)
4242
{
4343
m->opm_enable = NULL;
4444
m->opm_op = NULL;
45+
m->opm_device_enabled = false;
4546
memset(&(m->opm_fns), 0, sizeof(m->opm_fns));
4647
memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns));
4748
}
@@ -50,6 +51,7 @@ static void module_constructor_1_0_0(ompi_op_base_module_1_0_0_t *m)
5051
{
5152
m->opm_enable = NULL;
5253
m->opm_op = NULL;
54+
m->opm_device_enabled = false;
5355
memset(&(m->opm_fns), 0, sizeof(m->opm_fns));
5456
memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns));
5557
}

ompi/mca/op/base/op_base_op_select.c

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2009 The University of Tennessee and The University
6+
* Copyright (c) 2004-2023 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -152,22 +152,50 @@ int ompi_op_base_op_select(ompi_op_t *op)
152152
}
153153

154154
/* Copy over the non-NULL pointers */
155-
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
156-
/* 2-buffer variants */
157-
if (NULL != avail->ao_module->opm_fns[i]) {
158-
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
159-
op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i];
160-
op->o_func.intrinsic.modules[i] = avail->ao_module;
161-
OBJ_RETAIN(avail->ao_module);
155+
if (avail->ao_module->opm_device_enabled) {
156+
if (NULL == op->o_device_op) {
157+
op->o_device_op = calloc(1, sizeof(*op->o_device_op));
162158
}
163-
164-
/* 3-buffer variants */
165-
if (NULL != avail->ao_module->opm_3buff_fns[i]) {
166-
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
167-
op->o_3buff_intrinsic.fns[i] =
168-
avail->ao_module->opm_3buff_fns[i];
169-
op->o_3buff_intrinsic.modules[i] = avail->ao_module;
170-
OBJ_RETAIN(avail->ao_module);
159+
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
160+
/* 2-buffer variants */
161+
if (NULL != avail->ao_module->opm_stream_fns[i]) {
162+
if (NULL != op->o_device_op->do_intrinsic.modules[i]) {
163+
OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]);
164+
}
165+
op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_stream_fns[i];
166+
op->o_device_op->do_intrinsic.modules[i] = avail->ao_module;
167+
OBJ_RETAIN(avail->ao_module);
168+
}
169+
170+
/* 3-buffer variants */
171+
if (NULL != avail->ao_module->opm_3buff_stream_fns[i]) {
172+
if (NULL != op->o_device_op->do_3buff_intrinsic.modules[i]) {
173+
OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]);
174+
}
175+
op->o_device_op->do_3buff_intrinsic.fns[i] =
176+
avail->ao_module->opm_3buff_stream_fns[i];
177+
op->o_device_op->do_3buff_intrinsic.modules[i] = avail->ao_module;
178+
OBJ_RETAIN(avail->ao_module);
179+
}
180+
}
181+
} else {
182+
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
183+
/* 2-buffer variants */
184+
if (NULL != avail->ao_module->opm_fns[i]) {
185+
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
186+
op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i];
187+
op->o_func.intrinsic.modules[i] = avail->ao_module;
188+
OBJ_RETAIN(avail->ao_module);
189+
}
190+
191+
/* 3-buffer variants */
192+
if (NULL != avail->ao_module->opm_3buff_fns[i]) {
193+
OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]);
194+
op->o_3buff_intrinsic.fns[i] =
195+
avail->ao_module->opm_3buff_fns[i];
196+
op->o_3buff_intrinsic.modules[i] = avail->ao_module;
197+
OBJ_RETAIN(avail->ao_module);
198+
}
171199
}
172200
}
173201

ompi/mca/op/cuda/Makefile.am

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
#
# Copyright (c) 2023      The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# This component provides support for offloading reduce ops to CUDA devices.
#
# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
# for more details on how to make Open MPI components.

# First, list all .h and .c sources.  It is necessary to list all .h
# files so that they will be picked up in the distribution tarball.

AM_CPPFLAGS = $(op_cuda_CPPFLAGS) $(op_cudart_CPPFLAGS)

dist_ompidata_DATA = help-ompi-mca-op-cuda.txt

sources = op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h
#sources_extended = op_cuda_functions.cu
cu_sources = op_cuda_impl.cu

# NOTE(review): nvcc and its flags are hard-coded (nvcc taken from $PATH,
# compute_52 as the baseline architecture); presumably these should come
# from configure substitutions -- TODO confirm.
NVCC = nvcc -g
NVCCFLAGS= --std c++17 --gpu-architecture=compute_52

# Suffix rule: compile .cu files with nvcc through libtool into .lo
# objects so they can be linked into the (possibly shared) component.
.cu.l$(OBJEXT):
	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
		$(LIBTOOLFLAGS) --mode=compile $(NVCC) -prefer-non-pic $(NVCCFLAGS) -Wc,-Xcompiler,-fPIC,-g -c $<

# -o $(@:.cu=.lo)

# Open MPI components can be compiled two ways:
#
# 1. As a standalone dynamic shared object (DSO), sometimes called a
# dynamically loadable library (DLL).
#
# 2. As a static library that is slurped up into the upper-level
# libmpi library (regardless of whether libmpi is a static or dynamic
# library).  This is called a "Libtool convenience library".
#
# The component needs to create an output library in this top-level
# component directory, and named either mca_<type>_<name>.la (for DSO
# builds) or libmca_<type>_<name>.la (for static builds).  The OMPI
# build system will have set the
# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
# which way this component should be built.

if MCA_BUILD_ompi_op_cuda_DSO
component_install = mca_op_cuda.la
else
component_install =
component_noinst = libmca_op_cuda.la
endif

# Specific information for DSO builds.
#
# The DSO should install itself in $(ompilibdir) (by default,
# $prefix/lib/openmpi).

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_op_cuda_la_SOURCES = $(sources)
mca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo)
# NOTE(review): the libmpi reference below was mangled by the web scrape
# ("lib@[email protected]"); reconstructed as the conventional
# @OMPI_LIBMPI_NAME@ substitution -- TODO confirm against the repository.
mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
        $(op_cuda_LIBS) $(op_cudart_LDFLAGS) $(op_cudart_LIBS)
EXTRA_mca_op_cuda_la_SOURCES = $(cu_sources)

# Specific information for static builds.
#
# Note that we *must* "noinst"; the upper-layer Makefile.am's will
# slurp in the resulting .la library into libmpi.

noinst_LTLIBRARIES = $(component_noinst)
libmca_op_cuda_la_SOURCES = $(sources)
libmca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo)
libmca_op_cuda_la_LDFLAGS = -module -avoid-version\
        $(op_cuda_LIBS) $(op_cudart_LDFLAGS) $(op_cudart_LIBS)
EXTRA_libmca_op_cuda_la_SOURCES = $(cu_sources)

ompi/mca/op/cuda/configure.m4

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
# -*- shell-script -*-
#
# Copyright (c) 2011-2013 NVIDIA Corporation.  All rights reserved.
# Copyright (c) 2023      The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# Copyright (c) 2022      Amazon.com, Inc. or its affiliates.
#                         All Rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

#
# If CUDA support was requested, then build the CUDA support library.
# This code makes sure the check was done earlier by the
# opal_check_cuda.m4 code.  It also copies the flags and libs under
# opal_cuda_CPPFLAGS, opal_cuda_LDFLAGS, and opal_cuda_LIBS

AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[

    AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile])

    # Probe for the CUDA driver API and the CUDA runtime; each check
    # fills in the op_cuda* / op_cudart* flag variables substituted below.
    OPAL_CHECK_CUDA([op_cuda])
    OPAL_CHECK_CUDART([op_cudart])

    # Fixed: the component links against both the driver API
    # ($(op_cuda_LIBS)) and the runtime ($(op_cudart_LIBS)) in its
    # Makefile, so it must only be enabled when BOTH checks succeeded.
    # The original gated on CUDA_SUPPORT alone, which would enable the
    # component -- and break the build -- when cudart was missing.
    AS_IF([test "x$CUDA_SUPPORT" = "x1" && test "x$CUDART_SUPPORT" = "x1"],
          [$1],
          [$2])

    AC_SUBST([op_cuda_CPPFLAGS])
    AC_SUBST([op_cuda_LDFLAGS])
    AC_SUBST([op_cuda_LIBS])

    AC_SUBST([op_cudart_CPPFLAGS])
    AC_SUBST([op_cudart_LDFLAGS])
    AC_SUBST([op_cudart_LIBS])

])dnl
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
# -*- text -*-
#
# Copyright (c) 2023      The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI's CUDA operator component
#
# Message emitted when a CUDA call made by the component fails.  The
# three %s arguments are presumably the name of the failing call, the
# error name, and the error description string -- TODO confirm against
# the opal_show_help() call site in the component source.
[CUDA call failed]
"CUDA call %s failed: %s: %s\n"

0 commit comments

Comments
 (0)