
Commit fd51829

Author: Paul F Baumeister (committed)
Merge branch 'try-rectangular' of github.com:real-space/tfQMRgpu into try-rectangular
2 parents 06e2859 + 2585c48, commit fd51829

9 files changed: +105 / -62 lines changed

example/tfqmrgpu_generate_FD_example.cxx

Lines changed: 7 additions & 7 deletions
@@ -211,15 +211,15 @@ extern "C" {
         std::fprintf(f, " <DataTensor type=\"%s\"", type);
         std::fprintf(f, " rank=\"3\" dimensions=\"%ld %d %d\"", nblocks, BS, BS);
         if (op.scale_data != 1) {
-            std::fprintf(f, " scale=\"%.15e\"", op.scale_data);
+            std::fprintf(f, " scale=\"%.16e\"", op.scale_data);
         } // scaling
         std::fprintf(f, ">\n");
         for (size_t iblock = 0; iblock < nblocks; ++iblock) {
             auto const block = op.blocks[iblock];
             assert(nullptr != block);
             for (int i = 0; i < BS; ++i) {
                 for (int j = 0; j < BS; ++j) {
-                    std::fprintf(f, "%g ", double(std::real(block->data[i][j])));
+                    std::fprintf(f, "%.15g ", double(std::real(block->data[i][j])));
                     if (is_complex)
                     std::fprintf(f, " %g ", double(std::imag(block->data[i][j])));
                 } // j
@@ -313,7 +313,7 @@ extern "C" {
     assert(Dimension > 0 && Dimension < 4);
     int constexpr BS = BlockEdge * ((Dimension > 1)? BlockEdge : 1)
                                  * ((Dimension > 2)? BlockEdge : 1);
-    BlockSparseOperator<BS, int32_t> A('A'); // for nFD <= 8 the scaled stencil can be represented by int32_t
+    BlockSparseOperator<BS, int64_t> A('A'); // for nFD <= 8 the scaled stencil can be represented by int64_t
     BlockSparseOperator<BS, int8_t> B('B'); // B only contains 0s and 1s, so the smallest data type is ok
     BlockSparseOperator<BS, float> X('X'); // float as we do not need a high precision to compare if the solution is about right

@@ -380,15 +380,15 @@ extern "C" {
     if (1 == nFD) {
         // already set, no warning
     } else {
-        if (echo > 0) std::cout << "# warning nFD=" << nFD << " but only {1,4,6} implemented, set nFD=1" << std::endl;
+        if (echo > 0) std::cout << "# warning nFD=" << nFD << " but only {1,4,6,8} implemented, set nFD=1" << std::endl;
         nFD = 1;
     }

     { // scope: check consistency of FD coefficients
         int64_t checksum{0};
         if (echo > 2) std::cout << "# use " << nFD << " finite-difference neighbors with coefficients:" << std::endl;
         for (int iFD = 0; iFD <= nFD; ++iFD) {
-            if (echo > 2) std::printf("# %i\t%9d/%d =%16.12f\n", iFD, FDcoeff[iFD], FDdenom, FDcoeff[iFD]/double(FDdenom));
+            if (echo > 2) std::printf("# %i\t%12d/%d =%16.12f\n", iFD, FDcoeff[iFD], FDdenom, FDcoeff[iFD]/double(FDdenom));
             checksum += FDcoeff[iFD] * (1ll + (iFD > 0)); // all but the central coefficient are added with a factor 2;
         } // iFD
         if (echo > 2) std::cout << std::endl;
@@ -437,9 +437,9 @@ extern "C" {
     if (echo > 1) std::cout << "# " << nob << " nonzero stencil blocks" << std::endl;

     // the stencil has integer coefficients if we do not divide by the finite-difference denominator
-    std::vector<DenseBlock<BS, int32_t>> Stencil(nob);
+    std::vector<DenseBlock<BS, int64_t>> Stencil(nob);

-    int32_t const sub_diagonal_term = std::round(FDdenom*energy);
+    int64_t const sub_diagonal_term = std::round(FDdenom*energy);
     double const energy_used = sub_diagonal_term/double(FDdenom);
     if (echo > 1) std::printf("# use energy shift %.15e\n", energy_used);
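Note on the checksum in the hunk above: the second-derivative stencil is symmetric, so every off-center coefficient enters twice and the total has to vanish for a pure Laplacian. A minimal standalone sketch of that check, using the textbook 3-point coefficients {-2, 1} over denominator 1 instead of the generator's FDcoeff/FDdenom tables (which are not part of this diff):

    // Hypothetical illustration: a symmetric 2nd-derivative FD stencil must sum to zero.
    // The coefficients below are the textbook nFD=1 (3-point) Laplacian, not the tables
    // used in tfqmrgpu_generate_FD_example.cxx.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
        int const nFD = 1;                 // number of finite-difference neighbors per side
        int64_t const FDcoeff[] = {-2, 1}; // central coefficient first, then the neighbor
        int64_t const FDdenom = 1;         // common denominator of the scaled integer stencil

        int64_t checksum{0};
        for (int iFD = 0; iFD <= nFD; ++iFD) {
            // all but the central coefficient are counted twice (left and right neighbor)
            checksum += FDcoeff[iFD] * (1ll + (iFD > 0));
            std::printf("# %i\t%lld/%lld =%16.12f\n", iFD, (long long)FDcoeff[iFD],
                        (long long)FDdenom, FDcoeff[iFD]/double(FDdenom));
        } // iFD
        assert(0 == checksum && "a pure 2nd-derivative stencil sums to zero");
        return 0;
    }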

tfQMRgpu/include/tfqmrgpu_blockmult.hxx

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,7 @@
     double_t const Aik_im = A_sk[1][iLM];

     // full_debug_printf("# %s block=%i threads=%i %i adds %g * %g for k=%i\n", __func__, blockIdx.x, iLM, jLN, Aik_re, Xkj_re, kLM);
+    // std::printf("# %s Y[%i][%i][%i] += %g * %g for k=%i\n", __func__, iYmat, iLM, jLN, Aik_re, Xkj_re, kLM); // real part only

     // complex multiplication, 8 Flop
     Yij_re[ia] += Aik_re * Xkj_re - Aik_im * Xkj_im; // Real part
@@ -85,6 +86,7 @@
     auto const iLM = ilm*NA + ia;
     Y[iYmat][0][iLM][jLN] = Yij_re[ia];
     Y[iYmat][1][iLM][jLN] = Yij_im[ia];
+    // std::printf("# %s Y[%i][%i][%i]= %g\n", __func__, iYmat, iLM, jLN, Y[iYmat][0][iLM][jLN]); // real part only
 } // ia

 } // gemmNxNf
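For orientation, the update that gemmNxNf performs for each (A, X) block pair is an ordinary complex matrix product with real and imaginary parts stored in separate planes (index 0 and 1), 8 Flop per inner step. A hypothetical CPU reference of that arithmetic, with small fixed block sizes in place of the kernel's tiled layout:

    // Hypothetical CPU sketch of the per-block-pair update Y += A * X with split
    // real/imaginary storage (plane 0 = real, plane 1 = imaginary), as in gemmNxNf.
    #include <cstdio>

    template <int LM, int LN>
    void block_gemm_acc(double Y[2][LM][LN], double const A[2][LM][LM], double const X[2][LM][LN]) {
        for (int i = 0; i < LM; ++i) {
            for (int j = 0; j < LN; ++j) {
                double re{0}, im{0};
                for (int k = 0; k < LM; ++k) {
                    double const Aik_re = A[0][i][k], Aik_im = A[1][i][k];
                    double const Xkj_re = X[0][k][j], Xkj_im = X[1][k][j];
                    re += Aik_re * Xkj_re - Aik_im * Xkj_im; // real part
                    im += Aik_re * Xkj_im + Aik_im * Xkj_re; // imaginary part
                } // k (4 multiplications + 4 additions = 8 Flop per k)
                Y[0][i][j] += re;
                Y[1][i][j] += im;
            } // j
        } // i
    } // block_gemm_acc

    int main() {
        double Y[2][2][2] = {}, A[2][2][2] = {}, X[2][2][2] = {};
        A[0][0][0] = 1; A[1][0][1] = 2; // A(0,0) = 1, A(0,1) = 2i
        X[0][0][0] = 3; X[1][1][0] = 4; // X(0,0) = 3, X(1,0) = 4i
        block_gemm_acc<2,2>(Y, A, X);
        std::printf("# Y(0,0) = %g %+g i\n", Y[0][0][0], Y[1][0][0]); // expect -5 +0 i
        return 0;
    }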

tfQMRgpu/include/tfqmrgpu_blocksparse.hxx

Lines changed: 45 additions & 26 deletions
@@ -77,45 +77,64 @@ class blocksparse_action_t {
         , cudaStream_t const streamId=0
         , bool const precondition=false
     ) {
-        // how to multiply the action onto x
+        // how to multiply the action A onto x
 #ifndef HAS_NO_CUDA
-
         // CUDA version
+
+        // int constexpr TUNE = 2; // TUNE = 2 does not launch for 16x16 and 64x64
+        // int constexpr TUNE = 1; // TUNE = 2 does not launch for 16x16 and 64x64
+        // int constexpr TUNE = 4; // TUNE = 4 does not work for LM==6
+        int constexpr TUNE = ((16 == LM) || (64 == LM)) ? 4 : 2; // fix
+        dim3 const nblocks(nnzbY, 1, 1); // number of blocks
+        dim3 constexpr threads(LN, TUNE, 1); // threads per block
 #ifdef FULLDEBUG
-        bool constexpr show_A_X_and_Y = true;
-        if (show_A_X_and_Y) {
+
+        { // scope: check if a kernel before this one failed
+            auto const err = cudaGetLastError();
+            if (cudaSuccess != err) {
+                auto const errString = cudaGetErrorString(err);
+                printf("[ERROR] in %s:%d cudaError \"%s\" in last kernel before gemmNxNf\n", __FILE__, __LINE__, errString);
+            } // error
+        } // scope
+
+        printf("# [info] launch gemmNxNf <real_t=%s,LM=%d,LN=%d,LM/TUNE=%d,double_t=%s> "
+               "<<< nblocks=(%d,%d,%d), threads=(%d,%d,%d) >>>\n",
+               (8 == sizeof(real_t))?"double":"float", LM, LN, LM/TUNE, (8 == sizeof(double_t))?"double":"float",
+               nblocks.x, nblocks.y, nblocks.z, threads.x, threads.y, threads.z);
+        // printf("# [info] launch gemmNxNf(y=%p, matA_d=%p, x=%p, pairs_d=%p, starts_d=%p);\n",
+        //                                 y, matA_d, x, pairs_d, starts_d);
+        cudaDeviceSynchronize();
+#endif // FULLDEBUG
+
+        gemmNxNf <real_t,LM,LN,LM/TUNE,double_t> <<< nblocks, threads, 0, streamId >>> (y, matA_d, x, pairs_d, starts_d);
+
+#ifdef FULLDEBUG
+        // cudaDeviceSynchronize(); // necessary?
+        // auto const err = cudaGetLastError();
+        auto const err = cudaDeviceSynchronize();
+        if (cudaSuccess != err) {
+            auto const errString = cudaGetErrorString(err);
+            printf("[ERROR] in %s:%d cudaError \"%s\" after kernel call!\n", __FILE__, __LINE__, errString);
+        } else {
+            cudaDeviceSynchronize(); // necessary?
+#ifdef EXTREMEDEBUG
             printf("\n\n# multiply:\n");
             for(int i{0}; i < nnzbY; ++i) {
                 printf("# from [%d to %d)\n", p->starts[i], p->starts[i + 1]);
                 for(int j = p->starts[i]; j < p->starts[i + 1]; ++j) {
                     printf("# pair %i %i\n", p->pairs[2*j], p->pairs[2*j + 1]);
                 } // j
             } // i
-            print_array<real_t, LM> <<< 1, 1, 0, streamId >>> (matA_d[0][0], p->nnzbA*2*LM, 'A');
-            print_array<uint32_t,1> <<< 1, 1, 0, streamId >>> ((uint32_t(*)[1])starts_d, nnzbY+1, 's', 'i');
+            print_array<uint32_t,1> <<< 1, 1, 0, streamId >>> ((uint32_t(*)[1])starts_d, nnzbY + 1, 's', 'i');
             print_array<uint32_t,2> <<< 1, 1, 0, streamId >>> ((uint32_t(*)[2])pairs_d, p->starts[nnzbY], 'p', 'i');
-            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (x[0][0], nnzbY*2*LM, 'x');
-        } // show_A_X_and_Y
-#endif // FULLDEBUG
-
-        int constexpr TUNE = 2; // TUNE = 4 does not work for LM==6
-        dim3 constexpr threads(LN, TUNE, 1);
-        gemmNxNf <real_t,LM,LN,LM/TUNE,double_t> <<< nnzbY, threads, 0, streamId >>> (y, matA_d, x, pairs_d, starts_d);
-
-#ifdef FULLDEBUG
-        cudaDeviceSynchronize(); // necessary?
-        auto const err = cudaGetLastError();
-        if (cudaSuccess != err) {
-            auto const errString = cudaGetErrorString(err);
-            printf("[ERROR] in %s:%d cudaError \"%s\" after kernel call!\n", __FILE__, __LINE__, errString);
-        } // error
-
-        if (show_A_X_and_Y) {
+            print_array<real_t, LM> <<< 1, 1, 0, streamId >>> (matA_d[0][0], p->nnzbA*2*LM, 'A', 'g');
+            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (x[0][0], nnzbY*2*LM, 'x', 'g');
             cudaDeviceSynchronize(); // necessary?
-            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (y[0][0], nnzbY*2*LM, 'y');
+            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (y[0][0], nnzbY*2*LM, 'y', 'g');
             cudaDeviceSynchronize(); // necessary?
             printf("\n");
-        } // show_A_X_and_Y
+#endif // EXTREMEDEBUG
+        } // true or false
 #endif // FULLDEBUG


@@ -175,7 +194,7 @@ class blocksparse_action_t {

 #endif // HAS_CUDA

-        return p->pairs.size()*.5*LM*8.*LM*LN; // returns the number of Flops: 8 per complex
+        return p->pairs.size()*.5*LM*8.*LM*LN; // returns the number of Flops or flops: 8 per complex
     } // multiply

     bsrsv_plan_t* get_plan() const { return p; }
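The unchanged return value of multiply() is also the Flop counter of one operator application: p->pairs stores two indices per (A, X) block pair, so pairs.size()*.5 is the number of block pairs, and each pair costs LM*LM*LN complex multiply-adds at 8 Flop each. A small sketch of that bookkeeping, with invented sizes standing in for the plan's counters:

    // Hypothetical Flop bookkeeping mirroring the return value of multiply():
    // (number of block pairs) * LM * LM * LN * 8, with 8 Flop per complex multiply-add.
    #include <cstddef>
    #include <cstdio>

    int main() {
        int constexpr LM = 16, LN = 32;          // assumed block dimensions
        std::size_t const pairs_size = 2 * 1000; // assumed: two stored indices per pair, 1000 block pairs

        double const flop = pairs_size*.5*LM*8.*LM*LN; // same expression as in multiply()
        std::printf("# %.0f Flop per application = %.3f GFlop\n", flop, 1e-9*flop);
        return 0;
    }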

tfQMRgpu/include/tfqmrgpu_core.hxx

Lines changed: 7 additions & 1 deletion
@@ -157,7 +157,13 @@ namespace tfqmrgpu {

     // ToDo: split this part into two: allocation on CPU and transfer to the CPU, can be done when setMatrix('B')
     get_data_from_gpu<double[LN]>(invBn2_h, tau, nCols, streamId, "norm2_of_B"); // inverse_norm2_of_B
-    for(auto rhs = 0; rhs < nRHSs; ++rhs) { invBn2_h[0][rhs] = 1./invBn2_h[0][rhs]; } // invert in-place on the host
+    double min_norm2{9e99}, max_norm2{-1};
+    for(auto rhs = 0; rhs < nRHSs; ++rhs) {
+        min_norm2 = std::min(min_norm2, invBn2_h[0][rhs]);
+        max_norm2 = std::max(max_norm2, invBn2_h[0][rhs]);
+        invBn2_h[0][rhs] = 1./invBn2_h[0][rhs]; // invert in-place on the host
+    } // rhs
+    std::printf("# norms of B within [%g, %g]\n", std::sqrt(min_norm2), std::sqrt(max_norm2)); // ToDo: make this debug_printf
 } // rhs_trivial

 tfqmrgpuStatus_t return_status{TFQMRGPU_STATUS_MAX_ITERATIONS}; // preliminary result

tfQMRgpu/include/tfqmrgpu_example_xml_reader.hxx

Lines changed: 12 additions & 12 deletions
@@ -101,9 +101,9 @@ namespace tfqmrgpu_example_xml_reader {
     } // read_sequence

     inline double read_in( // returns tolerance
-          bsr_t ABX[3]
-        , char const *const filename
-        , int const echo=0
+          bsr_t ABX[3] // result: complex block sparse operators
+        , char const *const filename // name of XML file
+        , int const echo=0 // log-level
     ) {
         double tolerance{0}; // init return value
         if (nullptr == filename) {
@@ -219,7 +219,7 @@
         assert(indirect[abx].size() == bsr.nnzb);
         // highest_index = *std::max_element(indirect[abx].begin(), indirect[abx].end());
     } else {
-        indirect[abx] = std::vector<unsigned>(bsr.nnzb);
+        indirect[abx].resize(bsr.nnzb);
         // create a trivial indirection vector, i.e. 0,1,2,3,...
         std::iota(indirect[abx].begin(), indirect[abx].end(), 0);
     } // Indirection
@@ -229,19 +229,19 @@
         assert(i < bsr.nnzb);
         ++stats[i];
     } // i
-    std::vector<unsigned> occurence(96, 0);
+    std::vector<unsigned> occurrence(96, 0);
     for (auto s : stats) {
-        if (s >= occurence.size()) occurence.resize(s + 1);
-        ++occurence[s];
+        if (s >= occurrence.size()) occurrence.resize(s + 1);
+        ++occurrence[s];
     } // s
-    for (int h = 0; h < occurence.size(); ++h) {
-        if (occurence[h] > 0) {
-            std::printf("# %s occurence[%i] = %d\n", id, h, occurence[h]);
+    for (int h = 0; h < occurrence.size(); ++h) {
+        if (occurrence[h] > 0) {
+            std::printf("# %s occurrence[%i] = %d\n", id, h, occurrence[h]);
         } // occurred at least once
     } // h
     if (!Indirection) {
         // the result of std::iota or other permutations must produce each number exactly once
-        assert(occurence[1] == bsr.nnzb);
+        assert(occurrence[1] == bsr.nnzb);
     } // no indirection
 } // analysis

@@ -266,7 +266,7 @@
     auto const target_size = size_t(bsr.nnzb) * block2;
     auto const data = read_sequence<double>(DataTensor->value(), echo, source_size*r1c2);
     assert(data.size() == source_size*r1c2);
-    bsr.mat = std::vector<double>(target_size*2, 0.0); // always complex (in RIRIRIRI data layout)
+    bsr.mat.resize(target_size*2, 0.0); // always complex (in RIRIRIRI data layout)
     if (dims[0] < 1) {
         std::printf("# DataTensor[%d] has no elements for operator %s\n", dims[0], id);
     } else {
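The occurrence histogram in the hunk above records how often each nonzero block index is referenced; without an explicit indirection list, the trivial list created by std::iota must reference every index exactly once, which is what the assertion checks. A compact standalone sketch of that consistency check (sizes invented for illustration):

    // Hypothetical sketch of the occurrence-histogram check: with a trivial indirection,
    // every target block index must occur exactly once.
    #include <cassert>
    #include <cstddef>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        std::size_t const nnzb = 5; // assumed number of nonzero blocks
        std::vector<unsigned> indirect(nnzb);
        std::iota(indirect.begin(), indirect.end(), 0); // trivial indirection 0,1,2,3,...

        std::vector<unsigned> stats(nnzb, 0);
        for (auto const i : indirect) {
            assert(i < nnzb);
            ++stats[i]; // how often each block index is referenced
        } // i

        std::vector<unsigned> occurrence(96, 0);
        for (auto const s : stats) {
            if (s >= occurrence.size()) occurrence.resize(s + 1);
            ++occurrence[s]; // histogram of reference counts
        } // s

        std::printf("# occurrence[1] = %u of %zu blocks\n", occurrence[1], nnzb);
        assert(occurrence[1] == nnzb); // a permutation hits each index exactly once
        return 0;
    }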

tfQMRgpu/include/tfqmrgpu_linalg.hxx

Lines changed: 22 additions & 9 deletions
@@ -14,7 +14,8 @@
 #include "tfqmrgpu_plan.hxx" // bsrsv_plan_t
 #include "tfqmrgpu_handle.hxx" // tfq_handle_t

-// #define DEBUG
+#define DEBUG
+// #define FULLDEBUG

 #ifdef DEBUG
     #define debug_printf(...) std::printf(__VA_ARGS__)
@@ -57,7 +58,7 @@ namespace tfqmrgpu {
         bet[i][0][j] = 0; bet[i][1][j] = 0; // beta := 0
         rho[i][0][j] = 0; rho[i][1][j] = 0; // rho := 0
 #ifdef FULLDEBUG
-        debug_printf("# tfQMRdec35 status[%i][%i]= -1, |z35|^2= %.1e, |rho|^2= %.1e\n", i, j, abs2z35, abs2rho);
+        debug_printf("# tfQMRdec35[%i][%i] status= -1 |z35|^2= %.1e |rho|^2= %.1e\n", i, j, abs2z35, abs2rho);
 #endif // FULLDEBUG
     } else {
         auto const rho_denom = 1./abs2rho;
@@ -66,6 +67,10 @@
         bet[i][1][j] = real_t((z35_Im*rho_Re - z35_Re*rho_Im) * rho_denom);
         // rho := z35
         rho[i][0][j] = z35_Re; rho[i][1][j] = z35_Im;
+#ifdef FULLDEBUG
+        debug_printf("# tfQMRdec35[%i][%i] status= %i beta= %g,%g rho= %g,%g\n",
+                        i, j, status[i][j], bet[i][0][j], bet[i][1][j], rho[i][0][j], rho[i][1][j]);
+#endif // FULLDEBUG
     }
     } // threads j
     } // blocks i
@@ -87,6 +92,7 @@
     (status, rho, bet, z35, nCols);
 } // tfQMRdec35

+
 template <typename real_t, int LN>
 void __global__ tfQMRdec34_kernel( // GPU kernel, must be launched with <<< nCols, LN >>>
     int8_t (*devPtr status)[LN] // tfQMR status (out)
@@ -118,10 +124,9 @@
         alf[i][0][j] = 0; alf[i][1][j] = 0; // alfa := 0
         c67[i][0][j] = 0; c67[i][1][j] = 0; // c67 := 0
 #ifdef FULLDEBUG
-        debug_printf("# tfQMRdec34 status[%i][%i]= -2, |z34|^2= %.1e, |rho|^2= %.1e\n", i, j, abs2z34, abs2rho);
+        debug_printf("# tfQMRdec34[%i][%i] status= -2 |z34|^2= %.1e |rho|^2= %.1e\n", i, j, abs2z34, abs2rho);
 #endif // FULLDEBUG
     } else {
-        // debug_printf("# tfQMRdec34 status[%i][%i] = %i\n", i, j, status[i][j]);
         auto const eta_Re = double(eta[i][0][j]),
                    eta_Im = double(eta[i][1][j]); // load eta

@@ -138,6 +143,10 @@
         // c67 := z34 * (var * eta / rho) = z34 * tmp, complex multiplication
         c67[i][0][j] = real_t(z34_Re*tmp_Re - z34_Im*tmp_Im);
         c67[i][1][j] = real_t(z34_Im*tmp_Re + z34_Re*tmp_Im);
+#ifdef FULLDEBUG
+        debug_printf("# tfQMRdec34[%i][%i] status= %i alfa= %g,%g c67= %g,%g\n",
+                        i, j, status[i][j], alf[i][0][j], alf[i][1][j], c67[i][0][j], c67[i][1][j]);
+#endif // FULLDEBUG
     }
     } // threads j
     } // blocks i
@@ -188,20 +197,21 @@
     if (std::abs(Tau) > EPSILON) {
         auto const D55 = d55[i][0][j]; // load
         auto const Var = D55 / Tau;
-#ifdef FULLDEBUG
-        debug_printf("# component in block %i element %i has tau= %g, d55= %g, var= %g\n", i, j, Tau, D55, Var);
-#endif // FULLDEBUG
         cosi = 1./(1. + Var);
         var[i][j] = Var; // store, do we need to store var in the 1st call to decT?
         tau[i][j] = D55 * cosi; // store
         r67 = real_t(Var * cosi);
-    } else {
 #ifdef FULLDEBUG
-        debug_printf("# component in block %i element %i has tau = 0\n", i, j);
+        debug_printf("# tfQMRdecT[%i][%i] tau= %g d55= %g var= %g cosi= %g new tau= %g\n",
+                        i, j, Tau, D55, Var, cosi, tau[i][j]);
 #endif // FULLDEBUG
+    } else {
         status[i][j] = -3; // early convergence or breakdown(stagnation)
         var[i][j] = 0; // store
         tau[i][j] = 0; // store
+#ifdef FULLDEBUG
+        debug_printf("# tfQMRdecT[%i][%i] status= -3\n", i, j);
+#endif // FULLDEBUG
     }

     if (status[i][j] < 0) {
@@ -216,6 +226,9 @@
         c67[i][0][j] = r67;
         c67[i][1][j] = 0; // no imaginary part given
     }
+#ifdef FULLDEBUG
+    debug_printf("# tfQMRdecT[%i][%i] eta= %g,%g c67= %g\n", i, j, eta[i][0][j], eta[i][1][j], r67);
+#endif // FULLDEBUG
     } // threads j
     } // blocks i
 } // tfQMRdecT_kernel
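In tfQMRdec35, beta is the complex quotient z35/rho evaluated component-wise: z35 times the conjugate of rho, divided by |rho|^2. A small host-side sketch that cross-checks the split-component formula against std::complex (the sample values are arbitrary):

    // Hypothetical cross-check of the split-component quotient beta = z35 / rho.
    #include <cassert>
    #include <cmath>
    #include <complex>
    #include <cstdio>

    int main() {
        double const z35_Re = 0.3, z35_Im = -1.2; // arbitrary sample values
        double const rho_Re = 0.7, rho_Im =  0.4;

        double const abs2rho = rho_Re*rho_Re + rho_Im*rho_Im;
        double const rho_denom = 1./abs2rho;
        double const bet_Re = (z35_Re*rho_Re + z35_Im*rho_Im) * rho_denom; // Re(z35 * conj(rho)) / |rho|^2
        double const bet_Im = (z35_Im*rho_Re - z35_Re*rho_Im) * rho_denom; // Im(z35 * conj(rho)) / |rho|^2

        auto const ref = std::complex<double>(z35_Re, z35_Im) / std::complex<double>(rho_Re, rho_Im);
        std::printf("# beta= %g,%g (std::complex reference %g,%g)\n", bet_Re, bet_Im, ref.real(), ref.imag());
        assert(std::abs(bet_Re - ref.real()) < 1e-14 && std::abs(bet_Im - ref.imag()) < 1e-14);
        return 0;
    }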

tfQMRgpu/include/tfqmrgpu_util.hxx

Lines changed: 4 additions & 4 deletions
@@ -108,14 +108,14 @@
 #endif // HAS_CUDA
     {
         char fmt[4] = " %f"; fmt[2] = format;
-        printf("\n# print array \'%c\' in format \"%s\" with %d rows of %d elements\n",
-                name, fmt, num, Dim);
+        printf("\n# print array \'%c\' in format \"%s\" with %lld rows of %d elements\n",
+                name, fmt, num, Dim);
         for(size_t i = 0; i < num; ++i) {
-            printf("# %c[%d] ", name, i);
+            printf("# %c[%lld]\t", name, i);
             for(int d = 0; d < Dim; ++d) {
                 printf(fmt, array[i][d]);
             } // d
-            printf(" \n");
+            printf("\n");
         } // i
     } // master
 } // print_array

tfQMRgpu/source/bench_tfqmrgpu.cu

Lines changed: 3 additions & 3 deletions
@@ -154,7 +154,7 @@ namespace GPUbench {
     // values come from Fortran, so we need to transpose the blocks of B
     callAndCheck( tfqmrgpu_bsrsv_setMatrix(handle, plan, 'B', Bmat, precision, ln, lm, 't', TFQMRGPU_LAYOUT_RIRIRIRI) )

-    // [optional ]step 8x: upload the values for the initial vectors X
+    // [optional] step 8x: upload the values for the initial vectors X

     // step 9: envoke the transpose-free Quasi Minimal Residual solver
     double solver_time = - getTime(); // start timer
@@ -196,8 +196,8 @@
     )
     std::printf("# GPU converged to %.1e in %d iterations\n", residuum_reached, iterations_needed);
     char const fF = ('z' == (precision | IgnoreCase))? 'F' : 'f'; // 'F':double, 'f':float
-    double const TFlop = 1e-12*flops_performed;
-    double const performance = TFlop/std::max(solver_time, 1e-6);
+    auto const TFlop = 1e-12*flops_performed;
+    auto const performance = TFlop/std::max(solver_time, 1e-6);
     std::printf("# GPU performed %.3f T%clop in %.3f seconds = %.3f T%clop/s\n",
                 TFlop, fF, solver_time, performance, fF);
 } // maxdev
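The closing lines of the benchmark turn the solver's Flop count into a rate: TFlop = 1e-12 * flops_performed, divided by the wall time clamped to at least one microsecond so a zero timer cannot blow up the quotient. A tiny sketch of that report with invented numbers:

    // Hypothetical re-creation of the benchmark's performance report; the counts are invented.
    #include <algorithm>
    #include <cstdio>

    int main() {
        double const flops_performed = 3.2e12; // assumed Flop count returned by the solver
        double const solver_time = 1.75;       // assumed wall time in seconds

        auto const TFlop = 1e-12*flops_performed;
        auto const performance = TFlop/std::max(solver_time, 1e-6); // guard against a zero timer
        std::printf("# GPU performed %.3f TFlop in %.3f seconds = %.3f TFlop/s\n",
                    TFlop, solver_time, performance);
        return 0;
    }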
