Skip to content

Commit 9f517e3

Browse files
committed
Type 3 working
1 parent 53a7c63 commit 9f517e3

File tree

5 files changed

+58
-44
lines changed

5 files changed

+58
-44
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to buil
3333
# if FINUFFT_USE_CPU is OFF, the following options are ignored
3434
set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying target architecture, defaults to -march=native")
3535
# sphinx tag (don't remove): @cmake_opts_end
36-
cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" OFF "FINUFFT_BUILD_PYTHON" OFF)
36+
cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF)
3737
cmake_dependent_option(FINUFFT_STATIC_LINKING "Disable static libraries in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF)
3838
cmake_dependent_option(FINUFFT_SHARED_LINKING "Shared should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF)
3939
# cmake-format: on

include/cufinufft/impl.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -713,8 +713,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
713713
thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
714714
d_plan->deconv, d_plan->deconv,
715715
[c1, c2, c3, d1, d2, d3, imasign] __host__ __device__(
716-
const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
717-
-> cuda_complex<T> {
716+
const thrust::tuple<T, T, T> tuple,
717+
cuda_complex<T> deconv) -> cuda_complex<T> {
718718
// d2 and d3 are 0 if dim < 2 and dim < 3
719719
const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
720720
c2 * (thrust::get<1>(tuple) + d2) +
@@ -747,7 +747,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
747747
int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3};
748748
cufinufft_opts t2opts = d_plan->opts;
749749
t2opts.modeord = 0;
750-
t2opts.debug = std::max(0, t2opts.debug - 1);
750+
t2opts.debug = std::max(0, t2opts.debug);
751751
t2opts.gpu_spreadinterponly = 0;
752752
// Safe to ignore the return value here?
753753
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);

src/cuda/3d/cufinufft3d.cu

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,36 +135,39 @@ int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
135135
int ier;
136136
cuda_complex<T> *d_cstart;
137137
cuda_complex<T> *d_fkstart;
138-
cuda_complex<T> *d_cbatch_start;
139138
const auto stream = d_plan->stream;
140139
for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
141140
int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
142141
d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
143142
d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N;
144-
d_cbatch_start = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M;
145-
d_plan->c = d_cbatch_start;
146-
d_plan->fk = d_plan->fw;
143+
// setting input for spreader
144+
d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M;
145+
// setting output for spreader
146+
d_plan->fk = d_plan->fw;
147147
// NOTE: fw might need to be set to 0
148148
// Step 0: pre-phase the input strengths
149149
for (int i = 0; i < blksize; i++) {
150150
thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase,
151151
d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M,
152-
d_plan->c_batch + i * d_plan->M,
153-
thrust::multiplies<cuda_complex<T>>());
152+
d_plan->c + i * d_plan->M, thrust::multiplies<cuda_complex<T>>());
154153
}
155154
// Step 1: Spread
156155
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
156+
// now d_plan->fk = d_plan->fw contains the spread values
157157
// Step 2: Type 3 NUFFT
158+
// type 2 goes from fk to c
159+
// saving the results directly in the user output array d_fk
160+
// it needs to do blksize transforms
158161
d_plan->t2_plan->ntransf = blksize;
159162
if ((ier = cufinufft3d2_exec<T>(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier;
160163
// Step 3: deconvolve
164+
// now we need to d_fk = d_fk*d_plan->deconv
161165
for (int i = 0; i < blksize; i++) {
162166
thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv,
163167
d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N,
164168
d_fkstart + i * d_plan->N, thrust::multiplies<cuda_complex<T>>());
165169
}
166170
}
167-
168171
return 0;
169172
}
170173

test/cuda/cufinufft_setpts.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include <thrust/host_vector.h>
2020

2121
// for now, once finufft is demacroized we can test float
22-
using T = double;
22+
using test_t = double;
2323

2424
template<typename T, typename V> bool equal(V *d_vec, T *cpu, const std::size_t size) {
2525
// copy d_vec to cpu
@@ -98,15 +98,15 @@ int main() {
9898
const int N = n_modes[0] * n_modes[1] * n_modes[2];
9999
const int M = 100;
100100

101-
thrust::host_vector<T> x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf),
102-
t(N * ntransf), u(N * ntransf);
103-
thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(N * ntransf);
101+
thrust::host_vector<test_t> x(M * ntransf), y(M * ntransf), z(M * ntransf),
102+
s(N * ntransf), t(N * ntransf), u(N * ntransf);
103+
thrust::host_vector<thrust::complex<test_t>> c(M * ntransf), fk(N * ntransf);
104104

105-
thrust::device_vector<T> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
106-
thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(N * ntransf);
105+
thrust::device_vector<test_t> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
106+
thrust::device_vector<thrust::complex<test_t>> d_c(M * ntransf), d_fk(N * ntransf);
107107

108108
std::default_random_engine eng(42);
109-
std::uniform_real_distribution<T> dist11(-1, 1);
109+
std::uniform_real_distribution<test_t> dist11(-1, 1);
110110
auto rand_util_11 = [&eng, &dist11]() {
111111
return dist11(eng);
112112
};
@@ -237,7 +237,7 @@ int main() {
237237
};
238238
// testing correctness of the plan creation
239239
// cufinufft_plan_t<float> *single_plan{nullptr};
240-
cufinufft_plan_t<T> *double_plan{nullptr};
240+
cufinufft_plan_t<test_t> *double_plan{nullptr};
241241
// test_type1(double_plan);
242242
// test_type2(double_plan);
243243
test_type3(double_plan);

test/cuda/cufinufft_type3_test.cu

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,6 @@
1818
#include <thrust/device_vector.h>
1919
#include <thrust/host_vector.h>
2020

21-
// for now, once finufft is demacroized we can test float
22-
using T = double;
23-
2421
template<typename T, typename V> bool equal(V *d_vec, T *cpu, const std::size_t size) {
2522
// copy d_vec to cpu
2623
thrust::host_vector<T> h_vec(size);
@@ -75,10 +72,10 @@ auto almost_equal(V *d_vec,
7572
assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) ==
7673
cudaSuccess);
7774
// print h_vec and cpu
78-
// for (std::size_t i = 0; i < size; ++i) {
79-
// std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i]
80-
// << '\n';
81-
// }
75+
for (std::size_t i = 0; i < size; ++i) {
76+
std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i]
77+
<< '\n';
78+
}
8279
std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl;
8380
// compare the l2 norm of the difference between the two vectors
8481
if (relerrtwonorm(h_vec.data(), cpu, size) < tol) {
@@ -88,32 +85,39 @@ auto almost_equal(V *d_vec,
8885
}
8986

9087
int main() {
88+
// for now, once finufft is demacroized we can test float
89+
using test_t = double;
90+
9191
// defaults. tests should shadow them to override
9292
cufinufft_opts opts;
9393
cufinufft_default_opts(&opts);
94-
opts.debug = 2;
94+
opts.debug = 2;
95+
opts.upsampfac = 1.25;
96+
opts.gpu_kerevalmeth = 1;
9597
// opts.gpu_sort = 0;
9698
finufft_opts fin_opts;
9799
finufft_default_opts(&fin_opts);
98100
fin_opts.debug = 2;
99101
fin_opts.spread_kerevalmeth = 1;
102+
fin_opts.upsampfac = 1.25;
100103
const int iflag = 1;
101104
const int ntransf = 1;
102105
const int dim = 3;
103106
const double tol = 1e-9;
104-
const int N = 1023;
107+
const int n_modes[] = {10, 5, 3};
108+
const int N = n_modes[0] * n_modes[1] * n_modes[2];
105109
const int M = 1000;
106110
const double bandwidth = 50.0;
107111

108-
thrust::host_vector<T> x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf),
109-
t(N * ntransf), u(N * ntransf);
110-
thrust::host_vector<std::complex<T>> c(M * ntransf), fk(N * ntransf);
112+
thrust::host_vector<test_t> x(M * ntransf), y(M * ntransf), z(M * ntransf),
113+
s(N * ntransf), t(N * ntransf), u(N * ntransf);
114+
thrust::host_vector<std::complex<test_t>> c(M * ntransf), fk(N * ntransf);
111115

112-
thrust::device_vector<T> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
113-
thrust::device_vector<std::complex<T>> d_c(M * ntransf), d_fk(N * ntransf);
116+
thrust::device_vector<test_t> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
117+
thrust::device_vector<std::complex<test_t>> d_c(M * ntransf), d_fk(N * ntransf);
114118

115119
std::default_random_engine eng(42);
116-
std::uniform_real_distribution<T> dist11(-1, 1);
120+
std::uniform_real_distribution<test_t> dist11(-1, 1);
117121
auto rand_util_11 = [&eng, &dist11]() {
118122
return dist11(eng);
119123
};
@@ -161,11 +165,12 @@ int main() {
161165
cudaDeviceSynchronize();
162166

163167
const auto cpu_planer =
164-
[iflag, tol, ntransf, dim, M, N, &x, &y, &z, &s, &t, &u, &fin_opts](
168+
[iflag, tol, ntransf, dim, M, N, n_modes, &x, &y, &z, &s, &t, &u, &fin_opts](
165169
const auto type) {
166170
finufft_plan_s *plan{nullptr};
167-
assert(finufft_makeplan(
168-
type, dim, nullptr, iflag, ntransf, tol, &plan, &fin_opts) == 0);
171+
std::int64_t nl[] = {n_modes[0], n_modes[1], n_modes[2]};
172+
assert(
173+
finufft_makeplan(type, dim, nl, iflag, ntransf, tol, &plan, &fin_opts) == 0);
169174
assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(),
170175
t.data(), u.data()) == 0);
171176
return plan;
@@ -204,6 +209,7 @@ int main() {
204209
deconv_tol,
205210
M,
206211
N,
212+
n_modes,
207213
&d_x,
208214
&d_y,
209215
&d_z,
@@ -219,8 +225,8 @@ int main() {
219225
using T = typename std::remove_pointer<decltype(plan)>::type::real_t;
220226
const int type = 3;
221227
const auto cpu_plan = cpu_planer(type);
222-
assert(cufinufft_makeplan_impl<T>(type, dim, nullptr, iflag, ntransf, T(tol), &plan,
223-
&opts) == 0);
228+
assert(cufinufft_makeplan_impl<T>(type, dim, (int *)n_modes, iflag, ntransf, T(tol),
229+
&plan, &opts) == 0);
224230
assert(cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(),
225231
d_z.data().get(), N, d_s.data().get(),
226232
d_t.data().get(), d_u.data().get(), plan) == 0);
@@ -245,6 +251,11 @@ int main() {
245251
assert(equal(plan->kz, cpu_plan->Z, M));
246252
assert(equal(plan->d_s, cpu_plan->Sp, N));
247253
assert(equal(plan->d_t, cpu_plan->Tp, N));
254+
assert(plan->spopts.nspread == cpu_plan->spopts.nspread);
255+
assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac);
256+
assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta);
257+
assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth);
258+
assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c);
248259
assert(equal(plan->d_u, cpu_plan->Up, N));
249260
// NOTE:seems with infnorm we are getting at most 11 digits of precision
250261
std::cout << "prephase :\n";
@@ -258,10 +269,10 @@ int main() {
258269
c[i].imag(randm11());
259270
}
260271
d_c = c;
261-
for (int i = 0; i < N; i++) {
262-
fk[i] = {-100, -100};
263-
}
264-
d_fk = fk;
272+
// for (int i = 0; i < N; i++) {
273+
// fk[i] = {randm11(), randm11()};
274+
// }
275+
// d_fk = fk;
265276
cufinufft_execute_impl(
266277
(cuda_complex<T> *)d_c.data().get(), (cuda_complex<T> *)d_fk.data().get(), plan);
267278
finufft_execute(cpu_plan, (std::complex<T> *)c.data(), (std::complex<T> *)fk.data());
@@ -273,7 +284,7 @@ int main() {
273284
};
274285
// testing correctness of the plan creation
275286
// cufinufft_plan_t<float> *single_plan{nullptr};
276-
cufinufft_plan_t<T> *double_plan{nullptr};
287+
cufinufft_plan_t<test_t> *double_plan{nullptr};
277288
// test_type1(double_plan);
278289
// test_type2(double_plan);
279290
test_type3(double_plan);

0 commit comments

Comments
 (0)