Skip to content

Commit 6b7796f

Browse files
authored
Feature: Make sKG support GPU (#6046)
* Feature: Make sKG support GPU * fix bug * fix GPU bug * fix compile * fix compile * fix compile of DCU * fix test * update
1 parent 2b23c18 commit 6b7796f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+1073
-945
lines changed

source/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ if(USE_ROCM)
104104
module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu
105105
module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu
106106
module_base/kernels/rocm/math_kernel_op.hip.cu
107-
module_base/kernels/rocm/math_kernel_op.hip_vec.cu
107+
module_base/kernels/rocm/math_kernel_op_vec.hip.cu
108108
module_base/kernels/rocm/math_ylm_op.hip.cu
109109
module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu
110110
)

source/module_base/blas_connector.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons
820820
}
821821
else if (device_type == base_device::GpuDevice){
822822
#ifdef __CUDA
823-
ModuleBase::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
823+
ModuleBase::vector_add_vector_op<float, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
824824
#endif
825825
}
826826
}
@@ -838,7 +838,7 @@ void vector_add_vector(const int& dim, double *result, const double *vector1, co
838838
}
839839
else if (device_type == base_device::GpuDevice){
840840
#ifdef __CUDA
841-
ModuleBase::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
841+
ModuleBase::vector_add_vector_op<double, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
842842
#endif
843843
}
844844
}
@@ -856,7 +856,7 @@ void vector_add_vector(const int& dim, std::complex<float> *result, const std::c
856856
}
857857
else if (device_type == base_device::GpuDevice){
858858
#ifdef __CUDA
859-
ModuleBase::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
859+
ModuleBase::vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
860860
#endif
861861
}
862862
}
@@ -874,7 +874,7 @@ void vector_add_vector(const int& dim, std::complex<double> *result, const std::
874874
}
875875
else if (device_type == base_device::GpuDevice){
876876
#ifdef __CUDA
877-
ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
877+
ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
878878
#endif
879879
}
880880
}

source/module_base/kernels/cuda/math_kernel_op_vec.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ void vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operat
225225

226226
// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
227227
template <typename T>
228-
void constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
228+
void vector_add_vector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
229229
T* result,
230230
const T* vector1,
231231
const Real constant1,
@@ -314,10 +314,10 @@ template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GP
314314
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
315315
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
316316

317-
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
318-
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
319-
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
320-
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
317+
template struct vector_add_vector_op<float, base_device::DEVICE_GPU>;
318+
template struct vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
319+
template struct vector_add_vector_op<double, base_device::DEVICE_GPU>;
320+
template struct vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
321321

322322
template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
323323
template struct dot_real_op<double, base_device::DEVICE_GPU>;

source/module_base/kernels/math_kernel_op.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ template <typename T, typename Device> struct axpy_op {
134134

135135
// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
136136
template <typename T, typename Device>
137-
struct constantvector_addORsub_constantVector_op {
137+
struct vector_add_vector_op {
138138
using Real = typename GetTypeReal<T>::type;
139139
/// @brief result[i] = vector1[i] * constant1 + vector2[i] * constant2
140140
///
@@ -315,7 +315,7 @@ template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
315315

316316
// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
317317
template <typename T>
318-
struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
318+
struct vector_add_vector_op<T, base_device::DEVICE_GPU> {
319319
using Real = typename GetTypeReal<T>::type;
320320
void operator()(const int &dim, T *result,
321321
const T *vector1, const Real constant1, const T *vector2,

source/module_base/kernels/math_kernel_op_vec.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ struct axpy_op<T, base_device::DEVICE_CPU>
9292

9393

9494
template <typename T>
95-
struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_CPU>
95+
struct vector_add_vector_op<T, base_device::DEVICE_CPU>
9696
{
9797
using Real = typename GetTypeReal<T>::type;
9898
void operator()(const int& dim,
@@ -167,9 +167,9 @@ template struct axpy_op<std::complex<float>, base_device::DEVICE_CPU>;
167167
template struct axpy_op<std::complex<double>, base_device::DEVICE_CPU>;
168168
template struct axpy_op<double, base_device::DEVICE_CPU>;
169169

170-
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
171-
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
172-
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
170+
template struct vector_add_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
171+
template struct vector_add_vector_op<double, base_device::DEVICE_CPU>;
172+
template struct vector_add_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
173173

174174
template struct dot_real_op<std::complex<float>, base_device::DEVICE_CPU>;
175175
template struct dot_real_op<std::complex<double>, base_device::DEVICE_CPU>;

source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ void vector_mul_real_op<double, base_device::DEVICE_GPU>::operator()(const int d
8787
{
8888
int thread = 1024;
8989
int block = (dim + thread - 1) / thread;
90-
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
90+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel<double>),
9191
dim3(block),
9292
dim3(thread),
9393
0,
@@ -275,7 +275,7 @@ void vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operat
275275

276276
// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
277277
template <typename T>
278-
void constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
278+
void vector_add_vector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
279279
T* result,
280280
const T* vector1,
281281
const Real constant1,
@@ -365,10 +365,10 @@ template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GP
365365
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
366366
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
367367

368-
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
369-
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
370-
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
371-
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
368+
template struct vector_add_vector_op<float, base_device::DEVICE_GPU>;
369+
template struct vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
370+
template struct vector_add_vector_op<double, base_device::DEVICE_GPU>;
371+
template struct vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
372372

373373
template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
374374
template struct dot_real_op<double, base_device::DEVICE_GPU>;

source/module_base/kernels/test/math_kernel_test.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -75,17 +75,17 @@ class TestModuleHsolverMathKernel : public ::testing::Test
7575
using vector_mul_real_op_cpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_CPU>;
7676
using vector_mul_vector_op_cpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
7777
using vector_div_vector_op_cpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
78-
using constantvector_addORsub_constantVector_op_cpu
79-
= ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
78+
using vector_add_vector_op_cpu
79+
= ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
8080
using axpy_op_cpu = ModuleBase::axpy_op<std::complex<double>, base_device::DEVICE_CPU>;
8181
using scal_op_cpu = ModuleBase::scal_op<double, base_device::DEVICE_CPU>;
8282
using gemv_op_cpu = ModuleBase::gemv_op<std::complex<double>, base_device::DEVICE_CPU>;
8383
// gpu operator
8484
using vector_mul_real_op_gpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
8585
using vector_mul_vector_op_gpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
8686
using vector_div_vector_op_gpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
87-
using constantvector_addORsub_constantVector_op_gpu
88-
= ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
87+
using vector_add_vector_op_gpu
88+
= ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
8989
using axpy_op_gpu = ModuleBase::axpy_op<std::complex<double>, base_device::DEVICE_GPU>;
9090
using scal_op_gpu = ModuleBase::scal_op<double, base_device::DEVICE_GPU>;
9191
using gemv_op_gpu = ModuleBase::gemv_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -174,12 +174,12 @@ class TestModuleHsolverMathKernel : public ::testing::Test
174174
{2.05256102, -1.39373474},
175175
{-0.10166335, -0.49934031}};
176176

177-
// (3) for test constantvector_addORsub_constantVector_op
177+
// (3) for test vector_add_vector_op
178178
const double constant1 = 6.6;
179179
const double constant2 = 4.4;
180180
const std::vector<std::complex<double>> input1 = L;
181181
const std::vector<std::complex<double>> input2 = R;
182-
const std::vector<std::complex<double>> output_constantvector_addORsub_constantVector_op
182+
const std::vector<std::complex<double>> output_vector_add_vector_op
183183
= {{-5.05571797, -5.64586374},
184184
{-14.76279273, 4.05181248},
185185
{21.81709620, -17.11884992},
@@ -294,19 +294,19 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_cpu)
294294
}
295295
}
296296

297-
TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_cpu)
297+
TEST_F(TestModuleHsolverMathKernel, vector_add_vector_op_cpu)
298298
{
299299
std::vector<std::complex<double>> output(input.size());
300-
constantvector_addORsub_constantVector_op_cpu()(dim,
300+
vector_add_vector_op_cpu()(dim,
301301
output.data(),
302302
input1.data(),
303303
constant1,
304304
input2.data(),
305305
constant2);
306306
for (int i = 0; i < input.size(); i++)
307307
{
308-
EXPECT_LT(fabs(output[i].imag() - output_constantvector_addORsub_constantVector_op[i].imag()), 1e-8);
309-
EXPECT_LT(fabs(output[i].real() - output_constantvector_addORsub_constantVector_op[i].real()), 1e-8);
308+
EXPECT_LT(fabs(output[i].imag() - output_vector_add_vector_op[i].imag()), 1e-8);
309+
EXPECT_LT(fabs(output[i].real() - output_vector_add_vector_op[i].real()), 1e-8);
310310
}
311311
}
312312

@@ -478,7 +478,7 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu)
478478
delete_memory_op()(output_dev);
479479
}
480480

481-
TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gpu)
481+
TEST_F(TestModuleHsolverMathKernel, vector_add_vector_op_gpu)
482482
{
483483
// in CPU
484484
std::vector<std::complex<double>> output(input.size());
@@ -498,7 +498,7 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
498498
synchronize_memory_op()(input2_dev, input2.data(), input.size());
499499

500500
// run
501-
constantvector_addORsub_constantVector_op_gpu()(dim,
501+
vector_add_vector_op_gpu()(dim,
502502
output_dev,
503503
input1_dev,
504504
constant1,
@@ -510,8 +510,8 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
510510

511511
for (int i = 0; i < input.size(); i++)
512512
{
513-
EXPECT_LT(fabs(output[i].imag() - output_constantvector_addORsub_constantVector_op[i].imag()), 1e-8);
514-
EXPECT_LT(fabs(output[i].real() - output_constantvector_addORsub_constantVector_op[i].real()), 1e-8);
513+
EXPECT_LT(fabs(output[i].imag() - output_vector_add_vector_op[i].imag()), 1e-8);
514+
EXPECT_LT(fabs(output[i].real() - output_vector_add_vector_op[i].real()), 1e-8);
515515
}
516516

517517
delete_memory_op()(input1_dev);

source/module_base/math_chebyshev.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,9 @@ template class Chebyshev<float>;
767767
#endif
768768
#if ((defined __CUDA) || (defined __ROCM))
769769
template class Chebyshev<double, base_device::DEVICE_GPU>;
770+
#ifdef __ENABLE_FLOAT_FFTW
771+
template class Chebyshev<float, base_device::DEVICE_GPU>;
772+
#endif
770773
#endif
771774

772775
} // namespace ModuleBase

source/module_base/para_gemm.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -256,11 +256,6 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
256256
int m = colA_loc[ip];
257257
int size = m * LDA;
258258
MPI_Status status;
259-
#ifdef __CUDA_MPI
260-
// If the memory is not set to zero, it may cause the result to be wrong when using CUDA Aware MPI
261-
// I am not sure if it is due to CUDA Aware MPI or not
262-
base_device::memory::set_memory_op<T, Device>()(Atmp_device, 0, size);
263-
#endif
264259
Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, A_tmp_.data());
265260
MPI_Wait(&requests[ip], &status);
266261
ModuleBase::gemm_op<T, Device>()('C',

source/module_base/parallel_device.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,35 @@ void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nu
144144
#endif
145145
return;
146146
}
147+
148+
template <typename T, typename Device>
149+
void gatherv_dev(const T* sendbuf,
150+
int sendcount,
151+
T* recvbuf,
152+
const int* recvcounts,
153+
const int* displs,
154+
MPI_Comm& comm,
155+
T* tmp_sspace = nullptr,
156+
T* tmp_rspace = nullptr)
157+
{
158+
#ifdef __CUDA_MPI
159+
gatherv_data(sendbuf, sendcount, recvbuf, recvcounts, displs, comm);
160+
#else
161+
object_cpu_point<T,Device> o1, o2;
162+
int size = 0;
163+
MPI_Comm_size(comm, &size);
164+
int gather_space = displs[size - 1] + recvcounts[size - 1];
165+
T* sendbuf_cpu = o1.get(sendbuf, sendcount, tmp_sspace);
166+
T* recvbuf_cpu = o2.get(recvbuf, gather_space, tmp_rspace);
167+
o1.sync_d2h(sendbuf_cpu, sendbuf, sendcount);
168+
gatherv_data(sendbuf_cpu, sendcount, recvbuf_cpu, recvcounts, displs, comm);
169+
o2.sync_h2d(recvbuf, recvbuf_cpu, gather_space);
170+
o1.del(sendbuf_cpu);
171+
o2.del(recvbuf_cpu);
172+
#endif
173+
return;
174+
}
175+
147176
}
148177

149178

0 commit comments

Comments
 (0)