
Commit 764f98b

remove trailing whitespaces

Author: Paul F Baumeister
1 parent: f4a368d

10 files changed: +60 -60 lines changed. With one exception (a comment update in bench_tfqmrgpu.cu, line 19), every removed/added line pair below differs only in trailing whitespace, so both sides of a pair render identically.

tfQMRgpu/include/tfqmrgpu_Fortran.h

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@
     integer(kind=4), parameter :: TFQMRGPU_LAYOUT_RIRIRIRI = 85 !! default host layout, real and imag parts are interleaved.
    integer(kind=4), parameter :: TFQMRGPU_LAYOUT_DEFAULT = 85 !! default Fortran data layout for complex and double complex

-    ! !! pointer types require 64bit
+    ! !! pointer types require 64bit
     integer, parameter :: TFQMRGPU_HANDLE_KIND = 8 !! a pointer to an opaque handle
     integer, parameter :: TFQMRGPU_PLAN_KIND = 8 !! a pointer to an opaque plan object
     integer, parameter :: TFQMRGPU_PTR_KIND = 8 !! a pointer to data
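The comment "pointer types require 64bit" is the key constraint in this hunk: on the C/C++ side these three KINDs carry raw addresses, so the Fortran integers must be 8 bytes wide. A minimal C++ sketch of the round trip; the opaque struct name is illustrative, not the library's:

    #include <cstdint>

    struct tfqmrgpuHandle; // hypothetical stand-in for the library's opaque handle type

    static_assert(sizeof(void*) == sizeof(int64_t), "pointer types require 64bit");

    // hand a handle to Fortran as integer(kind=TFQMRGPU_HANDLE_KIND), i.e. an int64_t
    int64_t to_fortran(tfqmrgpuHandle *h) { return reinterpret_cast<int64_t>(h); }

    // recover the pointer from the Fortran-side integer
    tfqmrgpuHandle* from_fortran(int64_t i) { return reinterpret_cast<tfqmrgpuHandle*>(i); }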

tfQMRgpu/include/tfqmrgpu_Fortran_module.F90

Lines changed: 6 additions & 6 deletions

@@ -54,7 +54,7 @@ module tfqmrgpu
     module procedure bsrsv_solve, &
                      tfqmrgpu_bsrsv_complete
   endinterface
-
+
   contains

   subroutine print_error(status, ierr)
@@ -93,10 +93,10 @@ subroutine getStream(handle, streamId, ierr)
     external :: tfqmrgpugetstream
     call tfqmrgpugetstream(handle, streamId, ierr)
   endsubroutine ! get
-
+

#define DevPtrType integer(kind=8)
-
+
   subroutine createWorkspace(pBuffer, pBufferSizeInBytes, ierr)
     integer(kind=4), intent(out) :: ierr ! this is the return value in the C-API
     DevPtrType, intent(inout) :: pBuffer
@@ -154,7 +154,7 @@ subroutine bsrsv_destroyPlan(handle, plan, ierr)
     external :: tfqmrgpu_bsrsv_destroyplan
     call tfqmrgpu_bsrsv_destroyplan(handle, plan, ierr)
   endsubroutine ! destroy
-
+
   subroutine bsrsv_bufferSize(handle, plan, &
                               ldA, blockDim, ldB, RhsBlockDim, &
                               doublePrecision, pBufferSizeInBytes, ierr)
@@ -202,7 +202,7 @@ subroutine bsrsv_getBuffer(handle, plan, pBuffer, ierr)
#ifdef DEBUG
     write(*, '(a,":",i0,a,z0)') __FILE__, &
         __LINE__," got pBuffer = 0x",pBuffer
-#endif
+#endif
   endsubroutine ! get

   subroutine bsrsv_setMatrix_c(handle, plan, var, val, ld, trans, layout, ierr)
@@ -232,7 +232,7 @@ subroutine bsrsv_setMatrix_z(handle, plan, var, val, ld, trans, layout, ierr)
     external :: tfqmrgpu_bsrsv_setmatrix_z
     call tfqmrgpu_bsrsv_setmatrix_z(handle, plan, var, val, ld, trans, layout, ierr)
   endsubroutine ! set
-
+
   subroutine bsrsv_getMatrix_c(handle, plan, var, val, ld, trans, layout, ierr)
     !! retrieves the GPU memory buffer registered in plan.
     integer(kind=4), intent(out) :: ierr ! this is the return value in the C-API
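The wrappers above expose the usual two-phase workspace idiom: bsrsv_bufferSize() queries how many bytes a plan needs, and createWorkspace() supplies device memory of that size, which is then registered in the plan. A hedged host-side sketch of the same flow in C++, assuming the size was already obtained from the bufferSize query (the real registration call, tfqmrgpu_bsrsv_setBuffer, appears in the benchmark below):

    #include <cuda_runtime.h> // cudaMalloc
    #include <cstddef>        // size_t

    // sketch of the query-then-allocate idiom behind createWorkspace()
    char* allocate_workspace(size_t const pBufferSizeInBytes) {
        // in the real flow, pBufferSizeInBytes comes from bsrsv_bufferSize()
        char *pBuffer = nullptr;
        cudaMalloc((void**)&pBuffer, pBufferSizeInBytes); // device memory buffer
        return pBuffer; // to be registered via tfqmrgpu_bsrsv_setBuffer()
    }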

tfQMRgpu/include/tfqmrgpu_example_reader.hxx

Lines changed: 5 additions & 5 deletions

@@ -170,16 +170,16 @@ namespace tfqmrgpu_example_reader {
     std::cout << "# non-zeros " << avg_nzpr << " +/- " << dev_nzpr << " in " << nzpr0 << " of " << op->nRows << " rows" << std::endl;
     double dev_nzpc; double const avg_nzpc = average_and_deviation(nzpc0, nzpc1, nzpc2, &dev_nzpc);
     std::cout << "# non-zeros " << avg_nzpc << " +/- " << dev_nzpc << " in " << nzpc0 << " of " << op->nCols << " columns" << std::endl;
-
+
     std::cout << std::endl;
 } // op

 auto const A = &(ABX[0]), B = &(ABX[1]), X = &(ABX[2]);
-
+
 assert(B->nCols == nCols); // number of right hand sides
 assert(X->nCols == nCols); // number of right hand sides, redundant info, sorry
 assert(X->nRows == A->nCols); // multiplication of A*X must be well-defined
-
+
 assert(A->nRows == A->nCols); // A is assumed to be a square operator here
 assert(A->fastBlockDim == A->slowBlockDim); // A is assumed to be a square operator here

@@ -208,9 +208,9 @@ namespace tfqmrgpu_example_reader {
         }
         std::cout << std::endl;
     } // 0
-
+
 } // elongate the B operator
-
+
 return tolerance;
 } // read_in
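The "avg +/- dev" statistics printed above come from average_and_deviation(), which is called with what look like the zeroth, first, and second moments of the per-row and per-column nonzero counts (nzpr0/nzpr1/nzpr2, nzpc0/nzpc1/nzpc2). A plausible reconstruction under exactly that moment convention; this is a sketch, not the repository's implementation:

    #include <algorithm> // std::max
    #include <cmath>     // std::sqrt

    // n: number of samples, s1: sum of values, s2: sum of squared values
    double average_and_deviation(double const n, double const s1, double const s2, double *dev) {
        double const avg = s1/n;                           // sample mean
        *dev = std::sqrt(std::max(0.0, s2/n - avg*avg));   // population standard deviation
        return avg;
    }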

tfQMRgpu/include/tfqmrgpu_example_xml_reader.hxx

Lines changed: 4 additions & 4 deletions

@@ -60,7 +60,7 @@ namespace tfqmrgpu_example_xml_reader {
       rapidxml::xml_node<> const *node
     , char const *const child_name
     , int const echo=0
-) {
+) {
     if (nullptr != node) {
         for (auto child = node->first_node(); child; child = child->next_sibling()) {
             if (0 == std::strcmp(child_name, child->name())) {
@@ -117,7 +117,7 @@ namespace tfqmrgpu_example_xml_reader {

     // create the root node
     rapidxml::xml_document<> doc;
-
+
     if (echo > 0) std::printf("# parse file content using rapidxml\n");
     doc.parse<0>(infile.data());

@@ -172,7 +172,7 @@ namespace tfqmrgpu_example_xml_reader {
     std::printf("\n# Warning! Cannot find CompressedSparseRow in SparseMatrix\n\n");
     return 0;
 } // no csr found
-
+
 auto const nzpr = find_child(csr, "NonzerosPerRow", echo);
 if (nzpr) {
     int const nrows = std::atoi(find_attribute(nzpr, "rows", "0", echo));
@@ -247,7 +247,7 @@ namespace tfqmrgpu_example_xml_reader {
 } else { // SparseMatrix
     std::printf("\n# Warning! Cannot find a SparseMatrix for operator %s\n\n", id);
 } // SparseMatrix
-
+
 auto const DataTensor = find_child(BSM, "DataTensor", echo);
 if (DataTensor) {
     scale_values[abx] = std::atof(find_attribute(DataTensor, "scale", "1", echo));
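The hunk at line 60 shows the head and loop of find_child(); completed from that fragment, the rapidxml traversal pattern reads as follows. Only the return paths are filled in here, under the assumption that the first matching child wins (find_attribute presumably does the analogous scan over first_attribute()/next_attribute() with a string default):

    #include "rapidxml.hpp" // rapidxml::xml_node
    #include <cstring>      // std::strcmp

    rapidxml::xml_node<> const* find_child(
          rapidxml::xml_node<> const *node
        , char const *const child_name
        , int const echo=0 // verbosity; used for printing in the full version
    ) {
        if (nullptr != node) {
            for (auto child = node->first_node(); child; child = child->next_sibling()) {
                if (0 == std::strcmp(child_name, child->name())) return child; // first match wins
            }
        }
        return nullptr; // not found, or node itself was null
    }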

tfQMRgpu/include/tfqmrgpu_linalg.hxx

Lines changed: 21 additions & 21 deletions

@@ -22,9 +22,9 @@
 #endif // DEBUG

 namespace tfqmrgpu {
-
+
     // tfQMR decision sections ////////////////////////////////////////////////////////////////////////
-
+
 #define EPSILON 2.5e-308


@@ -36,7 +36,7 @@ namespace tfqmrgpu {
     , double const (*devPtr z35)[2][LM] // inner product v3.v5
     , uint32_t const nCols
 ) {
-#ifndef HAS_NO_CUDA
+#ifndef HAS_NO_CUDA
     check_launch_params( { nCols, 1, 1 }, { LM, 1, 1 } );
     { int const i = blockIdx.x;
     { int const j = threadIdx.x;
@@ -81,7 +81,7 @@ namespace tfqmrgpu {
     , double const (*devPtr var)[LM] // var
     , uint32_t const nCols
 ) {
-#ifndef HAS_NO_CUDA
+#ifndef HAS_NO_CUDA
     check_launch_params( { nCols, 1, 1 }, { LM, 1, 1 } );
     { int const i = blockIdx.x;
     { int const j = threadIdx.x;
@@ -122,8 +122,8 @@ namespace tfqmrgpu {
     } // threads j
     } // blocks i
 } // dec34
-
-
+
+
 template <typename real_t, int LM>
 void __global__ tfQMRdecT( // GPU kernel, must be launched with <<< nCols, LM >>>
       int8_t (*devPtr status)[LM] // tfQMR status
@@ -177,11 +177,11 @@ namespace tfqmrgpu {
     } // blocks i
 } // decT

-
-
+
+
     // basis linear algebra kernels ////////////////////////////////////////////////////////////////////////

-
+
 template <typename real_in_t, typename real_out_t>
 void __global__ convert_precision( // GPU kernel, must be launched with <<< { any, 1, 1 }, { any, 1, 1 } >>>
       real_out_t (*devPtr out) // result, out
@@ -335,7 +335,7 @@ namespace tfqmrgpu {
     }
 } // transpose_blocks

-
+
 #ifndef HAS_NO_CUDA
 template <typename real_t, int LM>
 void __global__ add_RHS_kernel( // GPU kernel, must be launched with <<< { any, 1, 1 }, { LM, 1, 1 } >>>
@@ -384,8 +384,8 @@ namespace tfqmrgpu {
 #endif // HAS_CUDA
 } // add_RHS

-
-
+
+
 #ifndef HAS_NO_CUDA
 template <typename real_t, int LM>
 void __global__ set_unit_blocks_kernel( // GPU kernel, must be launched with <<< { nnzb, 1, 1 }, { LM, 1, 1 } >>>
@@ -424,8 +424,8 @@ namespace tfqmrgpu {
     } // inzb
 #endif // HAS_CUDA
 } // set_unit_blocks
-
-
+
+

     // linear algebra functions ////////////////////////////////////////////////////////////////////////////////////////

@@ -470,7 +470,7 @@ namespace tfqmrgpu {
     if (2 == D2) {
         dots[iput*nCols + icol][1][j] = di; // no race condition here
     } // D2
-
+
     } // inz

 } // col_inner
@@ -648,8 +648,8 @@ namespace tfqmrgpu {


     // basis linear algebra level 3 kernels ////////////////////////////////////////////////////////////////////////
-
-#ifndef HAS_NO_CUDA
+
+#ifndef HAS_NO_CUDA
 template <typename real_t, int LM>
 void __global__ set_complex_value_kernel(
       real_t (*devPtr array)[2][LM] // 1D launch with correct size
@@ -682,8 +682,8 @@ namespace tfqmrgpu {
 #endif // HAS_CUDA
 } // set_complex_value

-
-#ifndef HAS_NO_CUDA
+
+#ifndef HAS_NO_CUDA
 template <typename real_t, int LM>
 void __global__ set_real_value_kernel(
       real_t (*devPtr array)[LM] // 1D launch with correct size
@@ -712,8 +712,8 @@ namespace tfqmrgpu {
     } // iblock
 #endif // HAS_CUDA
 } // set_real_value
-
-
+
+
 inline tfqmrgpuStatus_t create_random_numbers(
       float (*devPtr v3)
     , size_t const length // number of floats in v3
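Several kernels in this file guard their launch geometry with check_launch_params( { nCols, 1, 1 }, { LM, 1, 1 } ) before mapping blockIdx.x to the column index i and threadIdx.x to the slot j. A hedged sketch of what such a guard presumably asserts; the repository's actual implementation may differ (compile as CUDA, since it uses the built-in gridDim/blockDim):

    #include <cassert>

    // verify the kernel was launched with exactly the expected geometry,
    // so blockIdx.x and threadIdx.x can be used as direct indices
    __device__ inline void check_launch_params(dim3 const grid, dim3 const block) {
        assert(grid.x  == gridDim.x  && grid.y  == gridDim.y  && grid.z  == gridDim.z);
        assert(block.x == blockDim.x && block.y == blockDim.y && block.z == blockDim.z);
    }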

tfQMRgpu/include/tfqmrgpu_plan.hxx

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 #include "tfqmrgpu_memWindow.h" // memWindow_t

 struct bsrsv_plan_t {
-
+
     char* pBuffer; // device memory buffer

     uint32_t nRows; // number of block rows

tfQMRgpu/include/tfqmrgpu_util.hxx

Lines changed: 1 addition & 1 deletion

@@ -119,6 +119,6 @@
         } // i
     } // master
 } // print_array
-
+
 // absolute square of a complex number computed in double
 inline __host__ __device__ double abs2(double const zRe, double const zIm) { return zRe*zRe + zIm*zIm; }
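abs2() works on both host and device and avoids forming a std::complex temporary or taking a square root. A quick usage sketch, assuming tfqmrgpu_util.hxx is included:

    #include <cassert>
    // #include "tfqmrgpu_util.hxx" // provides abs2()

    void abs2_example() {
        double const zRe = 3.0, zIm = 4.0;
        assert(25.0 == abs2(zRe, zIm)); // |3 + 4i|^2 = 3*3 + 4*4 = 25
    }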

tfQMRgpu/source/bench_tfqmrgpu.cu

Lines changed: 5 additions & 5 deletions

@@ -16,7 +16,7 @@

 #include "tfqmrgpu_util.hxx" // FlopChar, CCheck, copy_data_to_gpu, get_data_from_gpu
 #ifndef HAS_NO_CUDA
-#include "tfqmrgpu_blockmult.hxx" // gemmNxNf, gemmNxNf1
+#include "tfqmrgpu_blockmult.hxx" // gemmNxNf
 #endif // HAS_CUDA

 #ifdef DEBUG
@@ -69,7 +69,7 @@ namespace GPUbench {
     // step 3: register the CUDA stream in the handle
     callAndCheck( tfqmrgpuSetStream(handle, streamId) )

-    if (1) { // sanity check
+    if (1) { // sanity check
         auto streamId_copy{streamId};
         callAndCheck( tfqmrgpuGetStream(handle, &streamId_copy) )
         assert(streamId == streamId_copy);
@@ -113,7 +113,7 @@ namespace GPUbench {
     // step 7: register the GPU memory buffer in the bsrsv-plan
     callAndCheck( tfqmrgpu_bsrsv_setBuffer(handle, plan, pBuffer) )

-    if (1) { // sanity check
+    if (1) { // sanity check
         auto pBuffer_copy{pBuffer};
         callAndCheck( tfqmrgpu_bsrsv_getBuffer(handle, plan, &pBuffer_copy) )
         assert(pBuffer == pBuffer_copy);
@@ -143,7 +143,7 @@ namespace GPUbench {
     // compare matX and matR (the reference matrix)
     auto const sizeX = X->mat.size();
     std::vector<double> Xref(X->mat); // copy constructor
-
+
     // step d: retrieve the result vectors X
     // convert the blocks into ColMajor and RIRIRIRI to match the Fortran data layout
     callAndCheck( tfqmrgpu_bsrsv_getMatrix(handle, plan, 'X',
@@ -539,6 +539,6 @@ int main(int const argc, char const *const argv[]) {
     std::printf("# found tolerance %g\n", tolerance);
     std::printf("# Execute %d repetitions with max. %d iterations.\n", nrep, MaxIter);
     std::printf("# requested precision = %c for LM = %d\n", flouble, ABX[0].fastBlockDim);
-
+
     return GPUbench::benchmark_tfQMRgpu_library(ABX, tolerance, MaxIter, nrep, flouble);
 } // main
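Every library call in the benchmark is wrapped in callAndCheck( ... ), used without a trailing semicolon, which suggests a block-bodied macro. Its definition is not part of this diff; the sketch below is a guess at the pattern, and both the macro body and the assumption that 0 encodes success are hypothetical:

    #include <cstdio>  // std::printf
    #include <cstdlib> // std::exit

    // hypothetical reconstruction of the benchmark's error-checking wrapper
    #define callAndCheck(CALL) { \
        auto const _stat = (CALL); \
        if (0 != _stat) { /* assumes 0 encodes success */ \
            std::printf("# %s:%d %s failed with status %d\n", \
                        __FILE__, __LINE__, #CALL, int(_stat)); \
            std::exit(int(_stat)); \
        } }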
