2
2
* Elementwise operations
3
3
*/
4
4
#include " elemwise.h"
5
+ #include < stdexcept> // std::invalid_argument
5
6
6
7
#ifndef CUVEC_DISABLE_CUDA
7
8
@@ -25,38 +26,61 @@ __global__ void knlAdd(float *dst, const float *src_a, const float *src_b, const
25
26
dst[i] = src_a[i] + src_b[i];
26
27
}
27
28
29
/// Whether `data` is dereferenceable from device code.
/// @return true for device or managed (unified) allocations,
///         false for registered-host or plain host pointers.
/// @throws std::invalid_argument if the runtime reports a memory type
///         outside the four known `cudaMemoryType` enumerators.
template <typename T> bool onGPU(const T *data) {
  cudaPointerAttributes attr;
  const cudaError_t err = cudaPointerGetAttributes(&attr, data);
  if (err != cudaSuccess) {
    // CUDA runtimes before 11.0 report an unregistered host pointer as
    // cudaErrorInvalidValue (leaving `attr` unset) instead of returning
    // cudaMemoryTypeUnregistered. Clear the sticky error so it does not
    // poison subsequent CUDA API calls, and treat the pointer as host memory.
    cudaGetLastError();
    return false;
  }
  switch (attr.type) {
  case cudaMemoryTypeDevice:
  case cudaMemoryTypeManaged:
    return true;
  case cudaMemoryTypeHost:
  case cudaMemoryTypeUnregistered:
    return false;
  default:
    throw std::invalid_argument("unknown memory type");
  }
}
45
+
28
46
#endif // CUVEC_DISABLE_CUDA
29
47
30
48
/// dst = src_num / src_div (elementwise, all arrays of length N).
/// Where src_div[i] == 0, writes `zeroDivDefault` instead — unless
/// `zeroDivDefault == FLOAT_MAX`, which means "divide anyway" (inf/nan).
/// Dispatches to a CUDA kernel when `dst` is device/managed memory
/// (launch is asynchronous w.r.t. the host); otherwise runs on the host.
/// No-op when N == 0. @throws std::runtime_error on kernel launch failure.
void div(float *dst, const float *src_num, const float *src_div, const size_t N,
         float zeroDivDefault) {
  if (N == 0) return; // avoid launching an invalid zero-block grid
#ifndef CUVEC_DISABLE_CUDA
  if (onGPU(dst)) {
    dim3 thrds(NUMCU_THREADS, 1, 1);
    dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1); // ceil-div
    knlDiv<<<blcks, thrds>>>(dst, src_num, src_div, N, zeroDivDefault);
    // kernel launches fail silently unless the last error is inspected
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
    return;
  }
#endif
  for (size_t i = 0; i < N; ++i)
    dst[i] =
        (src_div[i] || zeroDivDefault == FLOAT_MAX) ? src_num[i] / src_div[i] : zeroDivDefault;
}
43
63
/// dst = src_a * src_b (elementwise, all arrays of length N).
/// Dispatches to a CUDA kernel when `dst` is device/managed memory
/// (launch is asynchronous w.r.t. the host); otherwise runs on the host.
/// No-op when N == 0. @throws std::runtime_error on kernel launch failure.
void mul(float *dst, const float *src_a, const float *src_b, const size_t N) {
  if (N == 0) return; // avoid launching an invalid zero-block grid
#ifndef CUVEC_DISABLE_CUDA
  if (onGPU(dst)) {
    dim3 thrds(NUMCU_THREADS, 1, 1);
    dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1); // ceil-div
    knlMul<<<blcks, thrds>>>(dst, src_a, src_b, N);
    // kernel launches fail silently unless the last error is inspected
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
    return;
  }
#endif
  for (size_t i = 0; i < N; ++i) dst[i] = src_a[i] * src_b[i];
}
53
75
/// dst = src_a + src_b (elementwise, all arrays of length N).
/// Dispatches to a CUDA kernel when `dst` is device/managed memory
/// (launch is asynchronous w.r.t. the host); otherwise runs on the host.
/// No-op when N == 0. @throws std::runtime_error on kernel launch failure.
void add(float *dst, const float *src_a, const float *src_b, const size_t N) {
  if (N == 0) return; // avoid launching an invalid zero-block grid
#ifndef CUVEC_DISABLE_CUDA
  if (onGPU(dst)) {
    dim3 thrds(NUMCU_THREADS, 1, 1);
    dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1); // ceil-div
    knlAdd<<<blcks, thrds>>>(dst, src_a, src_b, N);
    // kernel launches fail silently unless the last error is inspected
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
    return;
  }
#endif
  for (size_t i = 0; i < N; ++i) dst[i] = src_a[i] + src_b[i];
}
0 commit comments