deepmodeling
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu
Lines changed: 5 additions & 5 deletions
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
Lines changed: 2 additions & 2 deletions
diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
Lines changed: 6 additions & 6 deletions
diff --git a/source/module_base/kernels/test/math_kernel_test.cpp b/source/module_base/kernels/test/math_kernel_test.cpp
Lines changed: 14 additions & 14 deletions
diff --git a/source/module_base/math_chebyshev.cpp b/source/module_base/math_chebyshev.cpp
Lines changed: 3 additions & 0 deletions
diff --git a/source/module_base/para_gemm.cpp b/source/module_base/para_gemm.cpp
Lines changed: 0 additions & 5 deletions
diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h
Lines changed: 29 additions & 0 deletions
@@ -104,7 +104,7 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu
     module_base/kernels/rocm/math_kernel_op.hip.cu
-    module_base/kernels/rocm/math_kernel_op.hip_vec.cu
+    module_base/kernels/rocm/math_kernel_op_vec.hip.cu
     module_base/kernels/rocm/math_ylm_op.hip.cu
     module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu
   )
 
@@ -820,7 +820,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		ModuleBase::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::vector_add_vector_op<float, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -838,7 +838,7 @@ void vector_add_vector(const int& dim, double *result, const double *vector1, co
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		ModuleBase::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::vector_add_vector_op<double, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -856,7 +856,7 @@ void vector_add_vector(const int& dim, std::complex<float> *result, const std::c
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		ModuleBase::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -874,7 +874,7 @@ void vector_add_vector(const int& dim, std::complex<double> *result, const std::
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -225,7 +225,7 @@ void vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operat
 
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
 template <typename T>
-void constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+void vector_add_vector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                        T* result,
                                                                                        const T* vector1,
                                                                                        const Real constant1,
@@ -314,10 +314,10 @@ template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 
-template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<float, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<double, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 
@@ -134,7 +134,7 @@ template <typename T, typename Device> struct axpy_op {
 
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
 template <typename T, typename Device>
-struct constantvector_addORsub_constantVector_op {
+struct vector_add_vector_op {
   using Real = typename GetTypeReal<T>::type;
   /// @brief result[i] = vector1[i] * constant1 + vector2[i] * constant2
   ///
@@ -315,7 +315,7 @@ template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
 
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
 template <typename T>
-struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
+struct vector_add_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
   void operator()(const int &dim, T *result,
                   const T *vector1, const Real constant1, const T *vector2,
 
@@ -92,7 +92,7 @@ struct axpy_op<T, base_device::DEVICE_CPU>
 
 
 template <typename T>
-struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_CPU>
+struct vector_add_vector_op<T, base_device::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
     void operator()(const int& dim,
@@ -167,9 +167,9 @@ template struct axpy_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct axpy_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct axpy_op<double, base_device::DEVICE_CPU>;
 
-template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
-template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
-template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct vector_add_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct vector_add_vector_op<double, base_device::DEVICE_CPU>;
+template struct vector_add_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -87,7 +87,7 @@ void vector_mul_real_op<double, base_device::DEVICE_GPU>::operator()(const int d
 {
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel<double>),
                        dim3(block),
                        dim3(thread),
                        0,
@@ -275,7 +275,7 @@ void vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operat
 
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
 template <typename T>
-void constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+void vector_add_vector_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                        T* result,
                                                                                        const T* vector1,
                                                                                        const Real constant1,
@@ -365,10 +365,10 @@ template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 
-template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<float, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<double, base_device::DEVICE_GPU>;
+template struct vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 
@@ -75,17 +75,17 @@ class TestModuleHsolverMathKernel : public ::testing::Test
     using vector_mul_real_op_cpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_CPU>;
     using vector_mul_vector_op_cpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
     using vector_div_vector_op_cpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
-    using constantvector_addORsub_constantVector_op_cpu
-        = ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
+    using vector_add_vector_op_cpu
+        = ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
     using axpy_op_cpu = ModuleBase::axpy_op<std::complex<double>, base_device::DEVICE_CPU>;
     using scal_op_cpu = ModuleBase::scal_op<double, base_device::DEVICE_CPU>;
     using gemv_op_cpu = ModuleBase::gemv_op<std::complex<double>, base_device::DEVICE_CPU>;
     // gpu operator
     using vector_mul_real_op_gpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
     using vector_mul_vector_op_gpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
     using vector_div_vector_op_gpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
-    using constantvector_addORsub_constantVector_op_gpu
-        = ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
+    using vector_add_vector_op_gpu
+        = ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
     using axpy_op_gpu = ModuleBase::axpy_op<std::complex<double>, base_device::DEVICE_GPU>;
     using scal_op_gpu = ModuleBase::scal_op<double, base_device::DEVICE_GPU>;
     using gemv_op_gpu = ModuleBase::gemv_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -174,12 +174,12 @@ class TestModuleHsolverMathKernel : public ::testing::Test
                                                                            {2.05256102, -1.39373474},
                                                                            {-0.10166335, -0.49934031}};
 
-    // (3) for test constantvector_addORsub_constantVector_op
+    // (3) for test vector_add_vector_op
     const double constant1 = 6.6;
     const double constant2 = 4.4;
     const std::vector<std::complex<double>> input1 = L;
     const std::vector<std::complex<double>> input2 = R;
-    const std::vector<std::complex<double>> output_constantvector_addORsub_constantVector_op
+    const std::vector<std::complex<double>> output_vector_add_vector_op
         = {{-5.05571797, -5.64586374},
            {-14.76279273, 4.05181248},
            {21.81709620, -17.11884992},
@@ -294,19 +294,19 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_cpu)
     }
 }
 
-TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_cpu)
+TEST_F(TestModuleHsolverMathKernel, vector_add_vector_op_cpu)
 {
     std::vector<std::complex<double>> output(input.size());
-    constantvector_addORsub_constantVector_op_cpu()(dim,
+    vector_add_vector_op_cpu()(dim,
                                                     output.data(),
                                                     input1.data(),
                                                     constant1,
                                                     input2.data(),
                                                     constant2);
     for (int i = 0; i < input.size(); i++)
     {
-        EXPECT_LT(fabs(output[i].imag() - output_constantvector_addORsub_constantVector_op[i].imag()), 1e-8);
-        EXPECT_LT(fabs(output[i].real() - output_constantvector_addORsub_constantVector_op[i].real()), 1e-8);
+        EXPECT_LT(fabs(output[i].imag() - output_vector_add_vector_op[i].imag()), 1e-8);
+        EXPECT_LT(fabs(output[i].real() - output_vector_add_vector_op[i].real()), 1e-8);
     }
 }
 
@@ -478,7 +478,7 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu)
     delete_memory_op()(output_dev);
 }
 
-TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gpu)
+TEST_F(TestModuleHsolverMathKernel, vector_add_vector_op_gpu)
 {
     // in CPU
     std::vector<std::complex<double>> output(input.size());
@@ -498,7 +498,7 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
     synchronize_memory_op()(input2_dev, input2.data(), input.size());
 
     // run
-    constantvector_addORsub_constantVector_op_gpu()(dim,
+    vector_add_vector_op_gpu()(dim,
                                                     output_dev,
                                                     input1_dev,
                                                     constant1,
@@ -510,8 +510,8 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
 
     for (int i = 0; i < input.size(); i++)
     {
-        EXPECT_LT(fabs(output[i].imag() - output_constantvector_addORsub_constantVector_op[i].imag()), 1e-8);
-        EXPECT_LT(fabs(output[i].real() - output_constantvector_addORsub_constantVector_op[i].real()), 1e-8);
+        EXPECT_LT(fabs(output[i].imag() - output_vector_add_vector_op[i].imag()), 1e-8);
+        EXPECT_LT(fabs(output[i].real() - output_vector_add_vector_op[i].real()), 1e-8);
     }
 
     delete_memory_op()(input1_dev);
 
@@ -767,6 +767,9 @@ template class Chebyshev<float>;
 #endif
 #if ((defined __CUDA) || (defined __ROCM))
 template class Chebyshev<double, base_device::DEVICE_GPU>;
+#ifdef __ENABLE_FLOAT_FFTW
+template class Chebyshev<float, base_device::DEVICE_GPU>;
+#endif
 #endif
 
 } // namespace ModuleBase
@@ -256,11 +256,6 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
             int m = colA_loc[ip];
             int size = m * LDA;
             MPI_Status status;
-#ifdef __CUDA_MPI
-            // If the memory is not set to zero, it may cause the result to be wrong when using CUDA Aware MPI
-            // I am not sure if it is due to CUDA Aware MPI or not
-            base_device::memory::set_memory_op<T, Device>()(Atmp_device, 0, size);
-#endif
             Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, A_tmp_.data());
             MPI_Wait(&requests[ip], &status);
             ModuleBase::gemm_op<T, Device>()('C',
 
@@ -144,6 +144,35 @@ void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nu
 #endif
     return;
 }
+
+template <typename T, typename Device> // gatherv wrapper for buffers of element type T associated with Device
+void gatherv_dev(const T* sendbuf,         // data contributed by the calling rank
+                 int sendcount,            // number of T elements read from sendbuf
+                 T* recvbuf,               // destination buffer for the gathered data
+                 const int* recvcounts,    // per-rank element counts; entry [size - 1] is read below
+                 const int* displs,        // per-rank element offsets; entry [size - 1] is read below
+                 MPI_Comm& comm,           // communicator; its size determines how many ranks take part
+                 T* tmp_sspace = nullptr,  // optional scratch pointer forwarded to o1.get() for the send side
+                 T* tmp_rspace = nullptr)  // optional scratch pointer forwarded to o2.get() for the receive side
+{
+#ifdef __CUDA_MPI
+    gatherv_data(sendbuf, sendcount, recvbuf, recvcounts, displs, comm); // caller's buffers are passed through unchanged, no staging
+#else
+    object_cpu_point<T,Device> o1, o2; // o1 manages the send-side staging pointer, o2 the receive-side one
+    int size = 0;
+    MPI_Comm_size(comm, &size); // number of ranks in comm
+    int gather_space = displs[size - 1] + recvcounts[size - 1]; // NOTE(review): uses the end of the last rank's segment as the total length; assumes displs is non-decreasing and that both arrays are valid on every rank - confirm against callers
+    T* sendbuf_cpu = o1.get(sendbuf, sendcount, tmp_sspace);     // staging pointer for the send data (presumably host memory - confirm in object_cpu_point)
+    T* recvbuf_cpu = o2.get(recvbuf, gather_space, tmp_rspace);  // staging pointer sized for the full gathered range
+    o1.sync_d2h(sendbuf_cpu, sendbuf, sendcount); // fill the send staging buffer from sendbuf (presumably a device-to-host copy)
+    gatherv_data(sendbuf_cpu, sendcount, recvbuf_cpu, recvcounts, displs, comm); // gather using the staging pointers
+    o2.sync_h2d(recvbuf, recvbuf_cpu, gather_space); // write all gather_space elements back into recvbuf (presumably a host-to-device copy)
+    o1.del(sendbuf_cpu); // hand the staging pointers back to their managers
+    o2.del(recvbuf_cpu);
+#endif
+    return;
+}
+
 }
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ if(USE_ROCM)`
`104`	`104`	`module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu`
`105`	`105`	`module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu`
`106`	`106`	`module_base/kernels/rocm/math_kernel_op.hip.cu`
`107`		`- module_base/kernels/rocm/math_kernel_op.hip_vec.cu`
	`107`	`+ module_base/kernels/rocm/math_kernel_op_vec.hip.cu`
`108`	`108`	`module_base/kernels/rocm/math_ylm_op.hip.cu`
`109`	`109`	`module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu`
`110`	`110`	`)`
Original file line number	Diff line number	Diff line change
`@@ -820,7 +820,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons`
`820`	`820`	`}`
`821`	`821`	`else if (device_type == base_device::GpuDevice){`
`822`	`822`	`#ifdef __CUDA`
`823`		`- ModuleBase::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
	`823`	`+ ModuleBase::vector_add_vector_op<float, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
`824`	`824`	`#endif`
`825`	`825`	`}`
`826`	`826`	`}`
`@@ -838,7 +838,7 @@ void vector_add_vector(const int& dim, double *result, const double *vector1, co`
`838`	`838`	`}`
`839`	`839`	`else if (device_type == base_device::GpuDevice){`
`840`	`840`	`#ifdef __CUDA`
`841`		`- ModuleBase::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
	`841`	`+ ModuleBase::vector_add_vector_op<double, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
`842`	`842`	`#endif`
`843`	`843`	`}`
`844`	`844`	`}`
`@@ -856,7 +856,7 @@ void vector_add_vector(const int& dim, std::complex<float> *result, const std::c`
`856`	`856`	`}`
`857`	`857`	`else if (device_type == base_device::GpuDevice){`
`858`	`858`	`#ifdef __CUDA`
`859`		`- ModuleBase::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
	`859`	`+ ModuleBase::vector_add_vector_op<std::complex<float>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
`860`	`860`	`#endif`
`861`	`861`	`}`
`862`	`862`	`}`
`@@ -874,7 +874,7 @@ void vector_add_vector(const int& dim, std::complex<double> *result, const std::`
`874`	`874`	`}`
`875`	`875`	`else if (device_type == base_device::GpuDevice){`
`876`	`876`	`#ifdef __CUDA`
`877`		`- ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
	`877`	`+ ModuleBase::vector_add_vector_op<std::complex<double>, base_device::DEVICE_GPU>()(dim, result, vector1, constant1, vector2, constant2);`
`878`	`878`	`#endif`
`879`	`879`	`}`
`880`	`880`	`}`