Skip to content

Commit 23617ce

Browse files
committed
bump version, merge pull request #8 from AMYPAD/cpu-fallback
2 parents 7743e2c + 6012fc2 commit 23617ce

File tree

1 file changed

+42
-18
lines changed

1 file changed

+42
-18
lines changed

numcu/src/elemwise.cu

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Elementwise operations
33
*/
44
#include "elemwise.h"
5+
#include <stdexcept> // std::invalid_argument
56

67
#ifndef CUVEC_DISABLE_CUDA
78

@@ -25,38 +26,61 @@ __global__ void knlAdd(float *dst, const float *src_a, const float *src_b, const
2526
dst[i] = src_a[i] + src_b[i];
2627
}
2728

29+
template <typename T> bool onGPU(const T *data) {
30+
cudaPointerAttributes attr;
31+
cudaPointerGetAttributes(&attr, data);
32+
switch (attr.type) {
33+
case cudaMemoryTypeDevice:
34+
case cudaMemoryTypeManaged:
35+
return true;
36+
case cudaMemoryTypeHost:
37+
case cudaMemoryTypeUnregistered:
38+
break;
39+
default:
40+
throw std::invalid_argument("unknown memory type");
41+
break;
42+
}
43+
return false;
44+
}
45+
2846
#endif // CUVEC_DISABLE_CUDA
2947

3048
/// dst = src_num / src_div
3149
void div(float *dst, const float *src_num, const float *src_div, const size_t N,
3250
float zeroDivDefault) {
33-
#ifdef CUVEC_DISABLE_CUDA
51+
#ifndef CUVEC_DISABLE_CUDA
52+
if (onGPU(dst)) {
53+
dim3 thrds(NUMCU_THREADS, 1, 1);
54+
dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1);
55+
knlDiv<<<blcks, thrds>>>(dst, src_num, src_div, N, zeroDivDefault);
56+
return;
57+
}
58+
#endif
3459
for (size_t i = 0; i < N; ++i)
3560
dst[i] =
3661
(src_div[i] || zeroDivDefault == FLOAT_MAX) ? src_num[i] / src_div[i] : zeroDivDefault;
37-
#else
38-
dim3 thrds(NUMCU_THREADS, 1, 1);
39-
dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1);
40-
knlDiv<<<blcks, thrds>>>(dst, src_num, src_div, N, zeroDivDefault);
41-
#endif
4262
}
4363
/// dst = src_a * src_b
4464
void mul(float *dst, const float *src_a, const float *src_b, const size_t N) {
45-
#ifdef CUVEC_DISABLE_CUDA
46-
for (size_t i = 0; i < N; ++i) dst[i] = src_a[i] * src_b[i];
47-
#else
48-
dim3 thrds(NUMCU_THREADS, 1, 1);
49-
dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1);
50-
knlMul<<<blcks, thrds>>>(dst, src_a, src_b, N);
65+
#ifndef CUVEC_DISABLE_CUDA
66+
if (onGPU(dst)) {
67+
dim3 thrds(NUMCU_THREADS, 1, 1);
68+
dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1);
69+
knlMul<<<blcks, thrds>>>(dst, src_a, src_b, N);
70+
return;
71+
}
5172
#endif
73+
for (size_t i = 0; i < N; ++i) dst[i] = src_a[i] * src_b[i];
5274
}
5375
/// dst = src_a + src_b
5476
void add(float *dst, const float *src_a, const float *src_b, const size_t N) {
55-
#ifdef CUVEC_DISABLE_CUDA
56-
for (size_t i = 0; i < N; ++i) dst[i] = src_a[i] + src_b[i];
57-
#else
58-
dim3 thrds(NUMCU_THREADS, 1, 1);
59-
dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1);
60-
knlAdd<<<blcks, thrds>>>(dst, src_a, src_b, N);
77+
#ifndef CUVEC_DISABLE_CUDA
78+
if (onGPU(dst)) {
79+
dim3 thrds(NUMCU_THREADS, 1, 1);
80+
dim3 blcks((N + NUMCU_THREADS - 1) / NUMCU_THREADS, 1, 1);
81+
knlAdd<<<blcks, thrds>>>(dst, src_a, src_b, N);
82+
return;
83+
}
6184
#endif
85+
for (size_t i = 0; i < N; ++i) dst[i] = src_a[i] + src_b[i];
6286
}

0 commit comments

Comments
 (0)