@@ -38,6 +38,9 @@ static struct ompi_op_base_module_1_0_0_t *
38
38
cuda_component_op_query (struct ompi_op_t * op , int * priority );
39
39
static int cuda_component_register (void );
40
40
41
+ static opal_mutex_t init_lock = OPAL_MUTEX_STATIC_INIT ;
42
+ static bool init_complete = false;
43
+
41
44
ompi_op_cuda_component_t mca_op_cuda_component = {
42
45
{
43
46
.opc_version = {
@@ -128,44 +131,6 @@ static int
128
131
/*
 * Component init query.
 *
 * CUDA device discovery and per-device limit caching now happen in
 * ompi_op_cuda_lazy_init() on first use, so there is nothing left to
 * do at component-init time.  The thread-support arguments are
 * accepted for the MCA interface but unused here.
 *
 * Returns OMPI_SUCCESS unconditionally.
 */
static int
cuda_component_init_query(bool enable_progress_threads,
                          bool enable_mpi_thread_multiple)
{
    return OMPI_SUCCESS;
}
171
136
@@ -193,3 +158,58 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority)
193
158
* priority = 50 ;
194
159
return (ompi_op_base_module_1_0_0_t * ) module ;
195
160
}
161
+
162
+ void ompi_op_cuda_lazy_init ()
163
+ {
164
+ /* Double checked locking to avoid having to
165
+ * grab locks post lazy-initialization. */
166
+ opal_atomic_rmb ();
167
+ if (init_complete ) return ;
168
+
169
+ OPAL_THREAD_LOCK (& init_lock );
170
+
171
+ if (!init_complete ) {
172
+ static opal_atomic_lock_t lock = OPAL_ATOMIC_LOCK_INIT ;
173
+ while
174
+ int num_devices ;
175
+ int rc ;
176
+ // TODO: is this init needed here?
177
+ cuInit (0 );
178
+ CHECK (cuDeviceGetCount , (& num_devices ));
179
+ mca_op_cuda_component .cu_num_devices = num_devices ;
180
+ mca_op_cuda_component .cu_devices = (CUdevice * )malloc (num_devices * sizeof (CUdevice ));
181
+ mca_op_cuda_component .cu_max_threads_per_block = (int * )malloc (num_devices * sizeof (int ));
182
+ mca_op_cuda_component .cu_max_blocks = (int * )malloc (num_devices * sizeof (int ));
183
+ for (int i = 0 ; i < num_devices ; ++ i ) {
184
+ CHECK (cuDeviceGet , (& mca_op_cuda_component .cu_devices [i ], i ));
185
+ rc = cuDeviceGetAttribute (& mca_op_cuda_component .cu_max_threads_per_block [i ],
186
+ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X ,
187
+ mca_op_cuda_component .cu_devices [i ]);
188
+ if (CUDA_SUCCESS != rc ) {
189
+ /* fall-back to value that should work on every device */
190
+ mca_op_cuda_component .cu_max_threads_per_block [i ] = 512 ;
191
+ }
192
+ if (-1 < mca_op_cuda_component .cu_max_num_threads ) {
193
+ if (mca_op_cuda_component .cu_max_threads_per_block [i ] >= mca_op_cuda_component .cu_max_num_threads ) {
194
+ mca_op_cuda_component .cu_max_threads_per_block [i ] = mca_op_cuda_component .cu_max_num_threads ;
195
+ }
196
+ }
197
+
198
+ rc = cuDeviceGetAttribute (& mca_op_cuda_component .cu_max_blocks [i ],
199
+ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X ,
200
+ mca_op_cuda_component .cu_devices [i ]);
201
+ if (CUDA_SUCCESS != rc ) {
202
+ /* fall-back to value that should work on every device */
203
+ mca_op_cuda_component .cu_max_blocks [i ] = 512 ;
204
+ }
205
+ if (-1 < mca_op_cuda_component .cu_max_num_blocks ) {
206
+ if (mca_op_cuda_component .cu_max_blocks [i ] >= mca_op_cuda_component .cu_max_num_blocks ) {
207
+ mca_op_cuda_component .cu_max_blocks [i ] = mca_op_cuda_component .cu_max_num_blocks ;
208
+ }
209
+ }
210
+ }
211
+ opal_atomic_wmb ();
212
+ init_complete = true;
213
+ }
214
+ OPAL_THREAD_UNLOCK (& init_lock );
215
+ }
0 commit comments