
Commit fd51829

Author: Paul F Baumeister (committed)
Merge branch 'try-rectangular' of github.com:real-space/tfQMRgpu into try-rectangular
2 parents 06e2859 + 2585c48, commit fd51829

9 files changed: +105 / -62 lines changed

example/tfqmrgpu_generate_FD_example.cxx

Lines changed: 7 additions & 7 deletions
@@ -211,15 +211,15 @@ extern "C" {
         std::fprintf(f, " <DataTensor type=\"%s\"", type);
         std::fprintf(f, " rank=\"3\" dimensions=\"%ld %d %d\"", nblocks, BS, BS);
         if (op.scale_data != 1) {
-            std::fprintf(f, " scale=\"%.15e\"", op.scale_data);
+            std::fprintf(f, " scale=\"%.16e\"", op.scale_data);
         } // scaling
         std::fprintf(f, ">\n");
         for (size_t iblock = 0; iblock < nblocks; ++iblock) {
             auto const block = op.blocks[iblock];
             assert(nullptr != block);
             for (int i = 0; i < BS; ++i) {
                 for (int j = 0; j < BS; ++j) {
-                    std::fprintf(f, "%g ", double(std::real(block->data[i][j])));
+                    std::fprintf(f, "%.15g ", double(std::real(block->data[i][j])));
                     if (is_complex)
                     std::fprintf(f, " %g ", double(std::imag(block->data[i][j])));
                 } // j
@@ -313,7 +313,7 @@ extern "C" {
     assert(Dimension > 0 && Dimension < 4);
     int constexpr BS = BlockEdge * ((Dimension > 1)? BlockEdge : 1)
                                  * ((Dimension > 2)? BlockEdge : 1);
-    BlockSparseOperator<BS, int32_t> A('A'); // for nFD <= 8 the scaled stencil can be represented by int32_t
+    BlockSparseOperator<BS, int64_t> A('A'); // for nFD <= 8 the scaled stencil can be represented by int64_t
     BlockSparseOperator<BS, int8_t> B('B'); // B only contains 0s and 1s, so the smallest data type is ok
     BlockSparseOperator<BS, float> X('X'); // float as we do not need a high precision to compare if the solution is about right

@@ -380,15 +380,15 @@ extern "C" {
     if (1 == nFD) {
         // already set, no warning
     } else {
-        if (echo > 0) std::cout << "# warning nFD=" << nFD << " but only {1,4,6} implemented, set nFD=1" << std::endl;
+        if (echo > 0) std::cout << "# warning nFD=" << nFD << " but only {1,4,6,8} implemented, set nFD=1" << std::endl;
         nFD = 1;
     }

     { // scope: check consistency of FD coefficients
         int64_t checksum{0};
         if (echo > 2) std::cout << "# use " << nFD << " finite-difference neighbors with coefficients:" << std::endl;
         for (int iFD = 0; iFD <= nFD; ++iFD) {
-            if (echo > 2) std::printf("# %i\t%9d/%d =%16.12f\n", iFD, FDcoeff[iFD], FDdenom, FDcoeff[iFD]/double(FDdenom));
+            if (echo > 2) std::printf("# %i\t%12d/%d =%16.12f\n", iFD, FDcoeff[iFD], FDdenom, FDcoeff[iFD]/double(FDdenom));
             checksum += FDcoeff[iFD] * (1ll + (iFD > 0)); // all but the central coefficient are added with a factor 2;
         } // iFD
         if (echo > 2) std::cout << std::endl;
@@ -437,9 +437,9 @@ extern "C" {
     if (echo > 1) std::cout << "# " << nob << " nonzero stencil blocks" << std::endl;

     // the stencil has integer coefficients if we do not divide by the finite-difference denominator
-    std::vector<DenseBlock<BS, int32_t>> Stencil(nob);
+    std::vector<DenseBlock<BS, int64_t>> Stencil(nob);

-    int32_t const sub_diagonal_term = std::round(FDdenom*energy);
+    int64_t const sub_diagonal_term = std::round(FDdenom*energy);
     double const energy_used = sub_diagonal_term/double(FDdenom);
     if (echo > 1) std::printf("# use energy shift %.15e\n", energy_used);
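Note on the checksum in the hunk above: the second-derivative stencil is symmetric, so every off-center coefficient enters twice and the total has to vanish for a pure Laplacian. A minimal standalone sketch of that check, using the textbook 3-point coefficients {-2, 1} over denominator 1 instead of the generator's FDcoeff/FDdenom tables (which are not part of this diff):

    // Hypothetical illustration: a symmetric 2nd-derivative FD stencil must sum to zero.
    // The coefficients below are the textbook nFD=1 (3-point) Laplacian, not the tables
    // used in tfqmrgpu_generate_FD_example.cxx.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
        int const nFD = 1;                 // number of finite-difference neighbors per side
        int64_t const FDcoeff[] = {-2, 1}; // central coefficient first, then the neighbor
        int64_t const FDdenom = 1;         // common denominator of the scaled integer stencil

        int64_t checksum{0};
        for (int iFD = 0; iFD <= nFD; ++iFD) {
            // all but the central coefficient are counted twice (left and right neighbor)
            checksum += FDcoeff[iFD] * (1ll + (iFD > 0));
            std::printf("# %i\t%lld/%lld =%16.12f\n", iFD, (long long)FDcoeff[iFD],
                        (long long)FDdenom, FDcoeff[iFD]/double(FDdenom));
        } // iFD
        assert(0 == checksum && "a pure 2nd-derivative stencil sums to zero");
        return 0;
    }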

tfQMRgpu/include/tfqmrgpu_blockmult.hxx

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,7 @@
     double_t const Aik_im = A_sk[1][iLM];

     // full_debug_printf("# %s block=%i threads=%i %i adds %g * %g for k=%i\n", __func__, blockIdx.x, iLM, jLN, Aik_re, Xkj_re, kLM);
+    // std::printf("# %s Y[%i][%i][%i] += %g * %g for k=%i\n", __func__, iYmat, iLM, jLN, Aik_re, Xkj_re, kLM); // real part only

     // complex multiplication, 8 Flop
     Yij_re[ia] += Aik_re * Xkj_re - Aik_im * Xkj_im; // Real part
@@ -85,6 +86,7 @@
     auto const iLM = ilm*NA + ia;
     Y[iYmat][0][iLM][jLN] = Yij_re[ia];
     Y[iYmat][1][iLM][jLN] = Yij_im[ia];
+    // std::printf("# %s Y[%i][%i][%i]= %g\n", __func__, iYmat, iLM, jLN, Y[iYmat][0][iLM][jLN]); // real part only
 } // ia

 } // gemmNxNf
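For orientation, the update that gemmNxNf performs for each (A, X) block pair is an ordinary complex matrix product with real and imaginary parts stored in separate planes (index 0 and 1), 8 Flop per inner step. A hypothetical CPU reference of that arithmetic, with small fixed block sizes in place of the kernel's tiled layout:

    // Hypothetical CPU sketch of the per-block-pair update Y += A * X with split
    // real/imaginary storage (plane 0 = real, plane 1 = imaginary), as in gemmNxNf.
    #include <cstdio>

    template <int LM, int LN>
    void block_gemm_acc(double Y[2][LM][LN], double const A[2][LM][LM], double const X[2][LM][LN]) {
        for (int i = 0; i < LM; ++i) {
            for (int j = 0; j < LN; ++j) {
                double re{0}, im{0};
                for (int k = 0; k < LM; ++k) {
                    double const Aik_re = A[0][i][k], Aik_im = A[1][i][k];
                    double const Xkj_re = X[0][k][j], Xkj_im = X[1][k][j];
                    re += Aik_re * Xkj_re - Aik_im * Xkj_im; // real part
                    im += Aik_re * Xkj_im + Aik_im * Xkj_re; // imaginary part
                } // k (4 multiplications + 4 additions = 8 Flop per k)
                Y[0][i][j] += re;
                Y[1][i][j] += im;
            } // j
        } // i
    } // block_gemm_acc

    int main() {
        double Y[2][2][2] = {}, A[2][2][2] = {}, X[2][2][2] = {};
        A[0][0][0] = 1; A[1][0][1] = 2; // A(0,0) = 1, A(0,1) = 2i
        X[0][0][0] = 3; X[1][1][0] = 4; // X(0,0) = 3, X(1,0) = 4i
        block_gemm_acc<2,2>(Y, A, X);
        std::printf("# Y(0,0) = %g %+g i\n", Y[0][0][0], Y[1][0][0]); // expect -5 +0 i
        return 0;
    }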

tfQMRgpu/include/tfqmrgpu_blocksparse.hxx

Lines changed: 45 additions & 26 deletions
@@ -77,45 +77,64 @@ class blocksparse_action_t {
         , cudaStream_t const streamId=0
         , bool const precondition=false
     ) {
-        // how to multiply the action onto x
+        // how to multiply the action A onto x
 #ifndef HAS_NO_CUDA
-
         // CUDA version
+
+        // int constexpr TUNE = 2; // TUNE = 2 does not launch for 16x16 and 64x64
+        // int constexpr TUNE = 1; // TUNE = 2 does not launch for 16x16 and 64x64
+        // int constexpr TUNE = 4; // TUNE = 4 does not work for LM==6
+        int constexpr TUNE = ((16 == LM) || (64 == LM)) ? 4 : 2; // fix
+        dim3 const nblocks(nnzbY, 1, 1); // number of blocks
+        dim3 constexpr threads(LN, TUNE, 1); // threads per block
 #ifdef FULLDEBUG
-        bool constexpr show_A_X_and_Y = true;
-        if (show_A_X_and_Y) {
+
+        { // scope: check if a kernel before this one failed
+            auto const err = cudaGetLastError();
+            if (cudaSuccess != err) {
+                auto const errString = cudaGetErrorString(err);
+                printf("[ERROR] in %s:%d cudaError \"%s\" in last kernel before gemmNxNf\n", __FILE__, __LINE__, errString);
+            } // error
+        } // scope
+
+        printf("# [info] launch gemmNxNf <real_t=%s,LM=%d,LN=%d,LM/TUNE=%d,double_t=%s> "
+               "<<< nblocks=(%d,%d,%d), threads=(%d,%d,%d) >>>\n",
+               (8 == sizeof(real_t))?"double":"float", LM, LN, LM/TUNE, (8 == sizeof(double_t))?"double":"float",
+               nblocks.x, nblocks.y, nblocks.z, threads.x, threads.y, threads.z);
+        // printf("# [info] launch gemmNxNf(y=%p, matA_d=%p, x=%p, pairs_d=%p, starts_d=%p);\n",
+        //                                 y, matA_d, x, pairs_d, starts_d);
+        cudaDeviceSynchronize();
+#endif // FULLDEBUG
+
+        gemmNxNf <real_t,LM,LN,LM/TUNE,double_t> <<< nblocks, threads, 0, streamId >>> (y, matA_d, x, pairs_d, starts_d);
+
+#ifdef FULLDEBUG
+        // cudaDeviceSynchronize(); // necessary?
+        // auto const err = cudaGetLastError();
+        auto const err = cudaDeviceSynchronize();
+        if (cudaSuccess != err) {
+            auto const errString = cudaGetErrorString(err);
+            printf("[ERROR] in %s:%d cudaError \"%s\" after kernel call!\n", __FILE__, __LINE__, errString);
+        } else {
+            cudaDeviceSynchronize(); // necessary?
+#ifdef EXTREMEDEBUG
             printf("\n\n# multiply:\n");
             for(int i{0}; i < nnzbY; ++i) {
                 printf("# from [%d to %d)\n", p->starts[i], p->starts[i + 1]);
                 for(int j = p->starts[i]; j < p->starts[i + 1]; ++j) {
                     printf("# pair %i %i\n", p->pairs[2*j], p->pairs[2*j + 1]);
                 } // j
             } // i
-            print_array<real_t, LM> <<< 1, 1, 0, streamId >>> (matA_d[0][0], p->nnzbA*2*LM, 'A');
-            print_array<uint32_t,1> <<< 1, 1, 0, streamId >>> ((uint32_t(*)[1])starts_d, nnzbY+1, 's', 'i');
+            print_array<uint32_t,1> <<< 1, 1, 0, streamId >>> ((uint32_t(*)[1])starts_d, nnzbY + 1, 's', 'i');
             print_array<uint32_t,2> <<< 1, 1, 0, streamId >>> ((uint32_t(*)[2])pairs_d, p->starts[nnzbY], 'p', 'i');
-            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (x[0][0], nnzbY*2*LM, 'x');
-        } // show_A_X_and_Y
-#endif // FULLDEBUG
-
-        int constexpr TUNE = 2; // TUNE = 4 does not work for LM==6
-        dim3 constexpr threads(LN, TUNE, 1);
-        gemmNxNf <real_t,LM,LN,LM/TUNE,double_t> <<< nnzbY, threads, 0, streamId >>> (y, matA_d, x, pairs_d, starts_d);
-
-#ifdef FULLDEBUG
-        cudaDeviceSynchronize(); // necessary?
-        auto const err = cudaGetLastError();
-        if (cudaSuccess != err) {
-            auto const errString = cudaGetErrorString(err);
-            printf("[ERROR] in %s:%d cudaError \"%s\" after kernel call!\n", __FILE__, __LINE__, errString);
-        } // error
-
-        if (show_A_X_and_Y) {
+            print_array<real_t, LM> <<< 1, 1, 0, streamId >>> (matA_d[0][0], p->nnzbA*2*LM, 'A', 'g');
+            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (x[0][0], nnzbY*2*LM, 'x', 'g');
             cudaDeviceSynchronize(); // necessary?
-            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (y[0][0], nnzbY*2*LM, 'y');
+            print_array<real_t, LN> <<< 1, 1, 0, streamId >>> (y[0][0], nnzbY*2*LM, 'y', 'g');
             cudaDeviceSynchronize(); // necessary?
             printf("\n");
-        } // show_A_X_and_Y
+#endif // EXTREMEDEBUG
+        } // true or false
 #endif // FULLDEBUG


@@ -175,7 +194,7 @@ class blocksparse_action_t {

 #endif // HAS_CUDA

-        return p->pairs.size()*.5*LM*8.*LM*LN; // returns the number of Flops: 8 per complex
+        return p->pairs.size()*.5*LM*8.*LM*LN; // returns the number of Flops or flops: 8 per complex
     } // multiply

     bsrsv_plan_t* get_plan() const { return p; }
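The unchanged return value of multiply() is also the Flop counter of one operator application: p->pairs stores two indices per (A, X) block pair, so pairs.size()*.5 is the number of block pairs, and each pair costs LM*LM*LN complex multiply-adds at 8 Flop each. A small sketch of that bookkeeping, with invented sizes standing in for the plan's counters:

    // Hypothetical Flop bookkeeping mirroring the return value of multiply():
    // (number of block pairs) * LM * LM * LN * 8, with 8 Flop per complex multiply-add.
    #include <cstddef>
    #include <cstdio>

    int main() {
        int constexpr LM = 16, LN = 32;          // assumed block dimensions
        std::size_t const pairs_size = 2 * 1000; // assumed: two stored indices per pair, 1000 block pairs

        double const flop = pairs_size*.5*LM*8.*LM*LN; // same expression as in multiply()
        std::printf("# %.0f Flop per application = %.3f GFlop\n", flop, 1e-9*flop);
        return 0;
    }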

tfQMRgpu/include/tfqmrgpu_core.hxx

Lines changed: 7 additions & 1 deletion
@@ -157,7 +157,13 @@ namespace tfqmrgpu {

     // ToDo: split this part into two: allocation on CPU and transfer to the CPU, can be done when setMatrix('B')
     get_data_from_gpu<double[LN]>(invBn2_h, tau, nCols, streamId, "norm2_of_B"); // inverse_norm2_of_B
-    for(auto rhs = 0; rhs < nRHSs; ++rhs) { invBn2_h[0][rhs] = 1./invBn2_h[0][rhs]; } // invert in-place on the host
+    double min_norm2{9e99}, max_norm2{-1};
+    for(auto rhs = 0; rhs < nRHSs; ++rhs) {
+        min_norm2 = std::min(min_norm2, invBn2_h[0][rhs]);
+        max_norm2 = std::max(max_norm2, invBn2_h[0][rhs]);
+        invBn2_h[0][rhs] = 1./invBn2_h[0][rhs]; // invert in-place on the host
+    } // rhs
+    std::printf("# norms of B within [%g, %g]\n", std::sqrt(min_norm2), std::sqrt(max_norm2)); // ToDo: make this debug_printf
 } // rhs_trivial

 tfqmrgpuStatus_t return_status{TFQMRGPU_STATUS_MAX_ITERATIONS}; // preliminary result

tfQMRgpu/include/tfqmrgpu_example_xml_reader.hxx

Lines changed: 12 additions & 12 deletions
@@ -101,9 +101,9 @@ namespace tfqmrgpu_example_xml_reader {
     } // read_sequence

     inline double read_in( // returns tolerance
-          bsr_t ABX[3]
-        , char const *const filename
-        , int const echo=0
+          bsr_t ABX[3] // result: complex block sparse operators
+        , char const *const filename // name of XML file
+        , int const echo=0 // log-level
     ) {
         double tolerance{0}; // init return value
         if (nullptr == filename) {
@@ -219,7 +219,7 @@
         assert(indirect[abx].size() == bsr.nnzb);
         // highest_index = *std::max_element(indirect[abx].begin(), indirect[abx].end());
     } else {
-        indirect[abx] = std::vector<unsigned>(bsr.nnzb);
+        indirect[abx].resize(bsr.nnzb);
         // create a trivial indirection vector, i.e. 0,1,2,3,...
         std::iota(indirect[abx].begin(), indirect[abx].end(), 0);
     } // Indirection
@@ -229,19 +229,19 @@
         assert(i < bsr.nnzb);
         ++stats[i];
     } // i
-    std::vector<unsigned> occurence(96, 0);
+    std::vector<unsigned> occurrence(96, 0);
     for (auto s : stats) {
-        if (s >= occurence.size()) occurence.resize(s + 1);
-        ++occurence[s];
+        if (s >= occurrence.size()) occurrence.resize(s + 1);
+        ++occurrence[s];
     } // s
-    for (int h = 0; h < occurence.size(); ++h) {
-        if (occurence[h] > 0) {
-            std::printf("# %s occurence[%i] = %d\n", id, h, occurence[h]);
+    for (int h = 0; h < occurrence.size(); ++h) {
+        if (occurrence[h] > 0) {
+            std::printf("# %s occurrence[%i] = %d\n", id, h, occurrence[h]);
         } // occurred at least once
     } // h
     if (!Indirection) {
         // the result of std::iota or other permutations must produce each number exactly once
-        assert(occurence[1] == bsr.nnzb);
+        assert(occurrence[1] == bsr.nnzb);
     } // no indirection
 } // analysis

@@ -266,7 +266,7 @@
     auto const target_size = size_t(bsr.nnzb) * block2;
     auto const data = read_sequence<double>(DataTensor->value(), echo, source_size*r1c2);
     assert(data.size() == source_size*r1c2);
-    bsr.mat = std::vector<double>(target_size*2, 0.0); // always complex (in RIRIRIRI data layout)
+    bsr.mat.resize(target_size*2, 0.0); // always complex (in RIRIRIRI data layout)
     if (dims[0] < 1) {
         std::printf("# DataTensor[%d] has no elements for operator %s\n", dims[0], id);
     } else {
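The occurrence histogram in the hunk above records how often each nonzero block index is referenced; without an explicit indirection list, the trivial list created by std::iota must reference every index exactly once, which is what the assertion checks. A compact standalone sketch of that consistency check (sizes invented for illustration):

    // Hypothetical sketch of the occurrence-histogram check: with a trivial indirection,
    // every target block index must occur exactly once.
    #include <cassert>
    #include <cstddef>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        std::size_t const nnzb = 5; // assumed number of nonzero blocks
        std::vector<unsigned> indirect(nnzb);
        std::iota(indirect.begin(), indirect.end(), 0); // trivial indirection 0,1,2,3,...

        std::vector<unsigned> stats(nnzb, 0);
        for (auto const i : indirect) {
            assert(i < nnzb);
            ++stats[i]; // how often each block index is referenced
        } // i

        std::vector<unsigned> occurrence(96, 0);
        for (auto const s : stats) {
            if (s >= occurrence.size()) occurrence.resize(s + 1);
            ++occurrence[s]; // histogram of reference counts
        } // s

        std::printf("# occurrence[1] = %u of %zu blocks\n", occurrence[1], nnzb);
        assert(occurrence[1] == nnzb); // a permutation hits each index exactly once
        return 0;
    }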

tfQMRgpu/include/tfqmrgpu_linalg.hxx

Lines changed: 22 additions & 9 deletions
@@ -14,7 +14,8 @@
 #include "tfqmrgpu_plan.hxx" // bsrsv_plan_t
 #include "tfqmrgpu_handle.hxx" // tfq_handle_t

-// #define DEBUG
+#define DEBUG
+// #define FULLDEBUG

 #ifdef DEBUG
     #define debug_printf(...) std::printf(__VA_ARGS__)
@@ -57,7 +58,7 @@ namespace tfqmrgpu {
         bet[i][0][j] = 0; bet[i][1][j] = 0; // beta := 0
         rho[i][0][j] = 0; rho[i][1][j] = 0; // rho := 0
 #ifdef FULLDEBUG
-        debug_printf("# tfQMRdec35 status[%i][%i]= -1, |z35|^2= %.1e, |rho|^2= %.1e\n", i, j, abs2z35, abs2rho);
+        debug_printf("# tfQMRdec35[%i][%i] status= -1 |z35|^2= %.1e |rho|^2= %.1e\n", i, j, abs2z35, abs2rho);
 #endif // FULLDEBUG
     } else {
         auto const rho_denom = 1./abs2rho;
@@ -66,6 +67,10 @@
         bet[i][1][j] = real_t((z35_Im*rho_Re - z35_Re*rho_Im) * rho_denom);
         // rho := z35
         rho[i][0][j] = z35_Re; rho[i][1][j] = z35_Im;
+#ifdef FULLDEBUG
+        debug_printf("# tfQMRdec35[%i][%i] status= %i beta= %g,%g rho= %g,%g\n",
+                        i, j, status[i][j], bet[i][0][j], bet[i][1][j], rho[i][0][j], rho[i][1][j]);
+#endif // FULLDEBUG
     }
     } // threads j
     } // blocks i
@@ -87,6 +92,7 @@
     (status, rho, bet, z35, nCols);
 } // tfQMRdec35

+
 template <typename real_t, int LN>
 void __global__ tfQMRdec34_kernel( // GPU kernel, must be launched with <<< nCols, LN >>>
     int8_t (*devPtr status)[LN] // tfQMR status (out)
@@ -118,10 +124,9 @@
         alf[i][0][j] = 0; alf[i][1][j] = 0; // alfa := 0
         c67[i][0][j] = 0; c67[i][1][j] = 0; // c67 := 0
 #ifdef FULLDEBUG
-        debug_printf("# tfQMRdec34 status[%i][%i]= -2, |z34|^2= %.1e, |rho|^2= %.1e\n", i, j, abs2z34, abs2rho);
+        debug_printf("# tfQMRdec34[%i][%i] status= -2 |z34|^2= %.1e |rho|^2= %.1e\n", i, j, abs2z34, abs2rho);
 #endif // FULLDEBUG
     } else {
-        // debug_printf("# tfQMRdec34 status[%i][%i] = %i\n", i, j, status[i][j]);
         auto const eta_Re = double(eta[i][0][j]),
                    eta_Im = double(eta[i][1][j]); // load eta

@@ -138,6 +143,10 @@
         // c67 := z34 * (var * eta / rho) = z34 * tmp, complex multiplication
         c67[i][0][j] = real_t(z34_Re*tmp_Re - z34_Im*tmp_Im);
         c67[i][1][j] = real_t(z34_Im*tmp_Re + z34_Re*tmp_Im);
+#ifdef FULLDEBUG
+        debug_printf("# tfQMRdec34[%i][%i] status= %i alfa= %g,%g c67= %g,%g\n",
+                        i, j, status[i][j], alf[i][0][j], alf[i][1][j], c67[i][0][j], c67[i][1][j]);
+#endif // FULLDEBUG
     }
     } // threads j
     } // blocks i
@@ -188,20 +197,21 @@
     if (std::abs(Tau) > EPSILON) {
         auto const D55 = d55[i][0][j]; // load
         auto const Var = D55 / Tau;
-#ifdef FULLDEBUG
-        debug_printf("# component in block %i element %i has tau= %g, d55= %g, var= %g\n", i, j, Tau, D55, Var);
-#endif // FULLDEBUG
         cosi = 1./(1. + Var);
         var[i][j] = Var; // store, do we need to store var in the 1st call to decT?
         tau[i][j] = D55 * cosi; // store
         r67 = real_t(Var * cosi);
-    } else {
 #ifdef FULLDEBUG
-        debug_printf("# component in block %i element %i has tau = 0\n", i, j);
+        debug_printf("# tfQMRdecT[%i][%i] tau= %g d55= %g var= %g cosi= %g new tau= %g\n",
+                        i, j, Tau, D55, Var, cosi, tau[i][j]);
 #endif // FULLDEBUG
+    } else {
         status[i][j] = -3; // early convergence or breakdown(stagnation)
         var[i][j] = 0; // store
         tau[i][j] = 0; // store
+#ifdef FULLDEBUG
+        debug_printf("# tfQMRdecT[%i][%i] status= -3\n", i, j);
+#endif // FULLDEBUG
     }

     if (status[i][j] < 0) {
@@ -216,6 +226,9 @@
         c67[i][0][j] = r67;
         c67[i][1][j] = 0; // no imaginary part given
     }
+#ifdef FULLDEBUG
+    debug_printf("# tfQMRdecT[%i][%i] eta= %g,%g c67= %g\n", i, j, eta[i][0][j], eta[i][1][j], r67);
+#endif // FULLDEBUG
     } // threads j
     } // blocks i
 } // tfQMRdecT_kernel
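In tfQMRdec35, beta is the complex quotient z35/rho evaluated component-wise: z35 times the conjugate of rho, divided by |rho|^2. A small host-side sketch that cross-checks the split-component formula against std::complex (the sample values are arbitrary):

    // Hypothetical cross-check of the split-component quotient beta = z35 / rho.
    #include <cassert>
    #include <cmath>
    #include <complex>
    #include <cstdio>

    int main() {
        double const z35_Re = 0.3, z35_Im = -1.2; // arbitrary sample values
        double const rho_Re = 0.7, rho_Im =  0.4;

        double const abs2rho = rho_Re*rho_Re + rho_Im*rho_Im;
        double const rho_denom = 1./abs2rho;
        double const bet_Re = (z35_Re*rho_Re + z35_Im*rho_Im) * rho_denom; // Re(z35 * conj(rho)) / |rho|^2
        double const bet_Im = (z35_Im*rho_Re - z35_Re*rho_Im) * rho_denom; // Im(z35 * conj(rho)) / |rho|^2

        auto const ref = std::complex<double>(z35_Re, z35_Im) / std::complex<double>(rho_Re, rho_Im);
        std::printf("# beta= %g,%g (std::complex reference %g,%g)\n", bet_Re, bet_Im, ref.real(), ref.imag());
        assert(std::abs(bet_Re - ref.real()) < 1e-14 && std::abs(bet_Im - ref.imag()) < 1e-14);
        return 0;
    }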

tfQMRgpu/include/tfqmrgpu_util.hxx

Lines changed: 4 additions & 4 deletions
@@ -108,14 +108,14 @@
 #endif // HAS_CUDA
     {
         char fmt[4] = " %f"; fmt[2] = format;
-        printf("\n# print array \'%c\' in format \"%s\" with %d rows of %d elements\n",
-                name, fmt, num, Dim);
+        printf("\n# print array \'%c\' in format \"%s\" with %lld rows of %d elements\n",
+                name, fmt, num, Dim);
         for(size_t i = 0; i < num; ++i) {
-            printf("# %c[%d] ", name, i);
+            printf("# %c[%lld]\t", name, i);
             for(int d = 0; d < Dim; ++d) {
                 printf(fmt, array[i][d]);
             } // d
-            printf(" \n");
+            printf("\n");
         } // i
     } // master
 } // print_array

tfQMRgpu/source/bench_tfqmrgpu.cu

Lines changed: 3 additions & 3 deletions
@@ -154,7 +154,7 @@ namespace GPUbench {
     // values come from Fortran, so we need to transpose the blocks of B
     callAndCheck( tfqmrgpu_bsrsv_setMatrix(handle, plan, 'B', Bmat, precision, ln, lm, 't', TFQMRGPU_LAYOUT_RIRIRIRI) )

-    // [optional ]step 8x: upload the values for the initial vectors X
+    // [optional] step 8x: upload the values for the initial vectors X

     // step 9: envoke the transpose-free Quasi Minimal Residual solver
     double solver_time = - getTime(); // start timer
@@ -196,8 +196,8 @@
     )
     std::printf("# GPU converged to %.1e in %d iterations\n", residuum_reached, iterations_needed);
     char const fF = ('z' == (precision | IgnoreCase))? 'F' : 'f'; // 'F':double, 'f':float
-    double const TFlop = 1e-12*flops_performed;
-    double const performance = TFlop/std::max(solver_time, 1e-6);
+    auto const TFlop = 1e-12*flops_performed;
+    auto const performance = TFlop/std::max(solver_time, 1e-6);
     std::printf("# GPU performed %.3f T%clop in %.3f seconds = %.3f T%clop/s\n",
                 TFlop, fF, solver_time, performance, fF);
 } // maxdev
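The closing lines of the benchmark turn the solver's Flop count into a rate: TFlop = 1e-12 * flops_performed, divided by the wall time clamped to at least one microsecond so a zero timer cannot blow up the quotient. A tiny sketch of that report with invented numbers:

    // Hypothetical re-creation of the benchmark's performance report; the counts are invented.
    #include <algorithm>
    #include <cstdio>

    int main() {
        double const flops_performed = 3.2e12; // assumed Flop count returned by the solver
        double const solver_time = 1.75;       // assumed wall time in seconds

        auto const TFlop = 1e-12*flops_performed;
        auto const performance = TFlop/std::max(solver_time, 1e-6); // guard against a zero timer
        std::printf("# GPU performed %.3f TFlop in %.3f seconds = %.3f TFlop/s\n",
                    TFlop, solver_time, performance);
        return 0;
    }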
