Skip to content

Commit 6a85957

Browse files
committed
op/cuda: Lazily initialize the CUDA information
Signed-off-by: Joseph Schuchart <[email protected]>
1 parent 53336c3 commit 6a85957

File tree

3 files changed

+62
-38
lines changed

3 files changed

+62
-38
lines changed

ompi/mca/op/cuda/op_cuda.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ ompi_op_base_stream_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_
7575
extern
7676
ompi_op_base_3buff_stream_handler_fn_t ompi_op_cuda_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
7777

78+
/**
 * Perform the one-time query of the CUDA devices and fill the per-device
 * limits cached in the op/cuda component. Intended to be called before the
 * cached limits are used; calls after the first completed initialization
 * return without repeating the query.
 */
void ompi_op_cuda_lazy_init(void);
79+
7880
END_C_DECLS
7981

8082
#endif /* MCA_OP_CUDA_EXPORT_H */

ompi/mca/op/cuda/op_cuda_component.c

Lines changed: 58 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ static struct ompi_op_base_module_1_0_0_t *
3838
cuda_component_op_query(struct ompi_op_t *op, int *priority);
3939
static int cuda_component_register(void);
4040

41+
/* Lock taken by ompi_op_cuda_lazy_init() around the one-time device query. */
static opal_mutex_t init_lock = OPAL_MUTEX_STATIC_INIT;
42+
/* Set to true by ompi_op_cuda_lazy_init() once the device information has
 * been stored in mca_op_cuda_component; checked to skip repeated queries. */
static bool init_complete = false;
43+
4144
ompi_op_cuda_component_t mca_op_cuda_component = {
4245
{
4346
.opc_version = {
@@ -128,44 +131,6 @@ static int
128131
cuda_component_init_query(bool enable_progress_threads,
129132
bool enable_mpi_thread_multiple)
130133
{
131-
int num_devices;
132-
int rc;
133-
// TODO: is this init needed here?
134-
cuInit(0);
135-
CHECK(cuDeviceGetCount, (&num_devices));
136-
mca_op_cuda_component.cu_num_devices = num_devices;
137-
mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice));
138-
mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int));
139-
mca_op_cuda_component.cu_max_blocks = (int*)malloc(num_devices*sizeof(int));
140-
for (int i = 0; i < num_devices; ++i) {
141-
CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i));
142-
rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i],
143-
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
144-
mca_op_cuda_component.cu_devices[i]);
145-
if (CUDA_SUCCESS != rc) {
146-
/* fall-back to value that should work on every device */
147-
mca_op_cuda_component.cu_max_threads_per_block[i] = 512;
148-
}
149-
if (-1 < mca_op_cuda_component.cu_max_num_threads) {
150-
if (mca_op_cuda_component.cu_max_threads_per_block[i] >= mca_op_cuda_component.cu_max_num_threads) {
151-
mca_op_cuda_component.cu_max_threads_per_block[i] = mca_op_cuda_component.cu_max_num_threads;
152-
}
153-
}
154-
155-
rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_blocks[i],
156-
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
157-
mca_op_cuda_component.cu_devices[i]);
158-
if (CUDA_SUCCESS != rc) {
159-
/* fall-back to value that should work on every device */
160-
mca_op_cuda_component.cu_max_blocks[i] = 512;
161-
}
162-
if (-1 < mca_op_cuda_component.cu_max_num_blocks) {
163-
if (mca_op_cuda_component.cu_max_blocks[i] >= mca_op_cuda_component.cu_max_num_blocks) {
164-
mca_op_cuda_component.cu_max_blocks[i] = mca_op_cuda_component.cu_max_num_blocks;
165-
}
166-
}
167-
}
168-
169134
return OMPI_SUCCESS;
170135
}
171136

@@ -193,3 +158,58 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority)
193158
*priority = 50;
194159
return (ompi_op_base_module_1_0_0_t *) module;
195160
}
161+
162+
void ompi_op_cuda_lazy_init()
163+
{
164+
/* Double checked locking to avoid having to
165+
* grab locks post lazy-initialization. */
166+
opal_atomic_rmb();
167+
if (init_complete) return;
168+
169+
OPAL_THREAD_LOCK(&init_lock);
170+
171+
if (!init_complete) {
172+
static opal_atomic_lock_t lock = OPAL_ATOMIC_LOCK_INIT;
173+
while
174+
int num_devices;
175+
int rc;
176+
// TODO: is this init needed here?
177+
cuInit(0);
178+
CHECK(cuDeviceGetCount, (&num_devices));
179+
mca_op_cuda_component.cu_num_devices = num_devices;
180+
mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice));
181+
mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int));
182+
mca_op_cuda_component.cu_max_blocks = (int*)malloc(num_devices*sizeof(int));
183+
for (int i = 0; i < num_devices; ++i) {
184+
CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i));
185+
rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i],
186+
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
187+
mca_op_cuda_component.cu_devices[i]);
188+
if (CUDA_SUCCESS != rc) {
189+
/* fall-back to value that should work on every device */
190+
mca_op_cuda_component.cu_max_threads_per_block[i] = 512;
191+
}
192+
if (-1 < mca_op_cuda_component.cu_max_num_threads) {
193+
if (mca_op_cuda_component.cu_max_threads_per_block[i] >= mca_op_cuda_component.cu_max_num_threads) {
194+
mca_op_cuda_component.cu_max_threads_per_block[i] = mca_op_cuda_component.cu_max_num_threads;
195+
}
196+
}
197+
198+
rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_blocks[i],
199+
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
200+
mca_op_cuda_component.cu_devices[i]);
201+
if (CUDA_SUCCESS != rc) {
202+
/* fall-back to value that should work on every device */
203+
mca_op_cuda_component.cu_max_blocks[i] = 512;
204+
}
205+
if (-1 < mca_op_cuda_component.cu_max_num_blocks) {
206+
if (mca_op_cuda_component.cu_max_blocks[i] >= mca_op_cuda_component.cu_max_num_blocks) {
207+
mca_op_cuda_component.cu_max_blocks[i] = mca_op_cuda_component.cu_max_num_blocks;
208+
}
209+
}
210+
}
211+
opal_atomic_wmb();
212+
init_complete = true;
213+
}
214+
OPAL_THREAD_UNLOCK(&init_lock);
215+
}

ompi/mca/op/cuda/op_cuda_functions.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ static inline void device_op_pre(const void *orig_source1,
5555
uint64_t target_flags = -1, source1_flags = -1, source2_flags = -1;
5656
int target_rc, source1_rc, source2_rc = -1;
5757

58+
ompi_op_cuda_lazy_init();
59+
5860
*target = orig_target;
5961
*source1 = (void*)orig_source1;
6062
if (NULL != orig_source2) {

0 commit comments

Comments
 (0)