Commit 50efe30f authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

first implementation of CUBLAS-based matrix inversion

parent 9fd4dea4
Loading
Loading
Loading
Loading
+63 −0
Original line number Diff line number Diff line
@@ -163,6 +163,69 @@ m4_define(
  ]
)

m4_define(
  [M4_DETECT_CUBLAS],
  [
    pkg-config --version > /dev/null
    use_pkg_config=$?
    if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then
      if test "x$use_pkg_config" = "x0"; then
        # pkg-config is available
        declare -a pkg_array=$(pkg-config --list-all | grep cublas)
        for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas > /dev/null
        result=$?
        if test "x$result" = "x0"; then
          # CUBLAS detected
          cuda_pkg=$(for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas)
    	CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg})
    	CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg})
        fi # end of CUBLAS runtime decision tree
        declare -a pkg_array=$(pkg-config --list-all | grep cudart)
        for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart > /dev/null
        result=$?
        if test "x$result" = "x0"; then
          # CUDA runtime detected
          cuda_pkg=$(for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart)
    	CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg})
    	CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg})
        fi # end of CUDA runtime decision tree
        echo $CUDALDFLAGS | grep cublas > /dev/null
        cudart_check=$?
        if test "x${cudart_check}" != "x0"; then
          CUDALDFLAGS="$CUDALDFLAGS -lcublas"
        fi
        echo $CUDALDFLAGS | grep cudart > /dev/null
        cudart_check=$?
        if test "x${cudart_check}" != "x0"; then
          CUDALDFLAGS="$CUDALDFLAGS -lcudart"
        fi
      else
        # pkg-config is not available
        if test -f /usr/local/cuda/include/cuda.h; then
          CUDAFLAGS="-I/usr/local/cuda/include"
          CUDALDFLAGS="-L/usr/local/cuda/lib64 -lcublas -lcudart"
        elif test -f /usr/include/cuda.h; then
          CUDAFLAGS="-I/usr/include"
          CUDALDFLAGS="-lcublas -lcudart"
        elif test "x$CUDA_HOME" != "x"; then
          CUDAFLAGS="-I${CUDA_HOME}/include"
          CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcublas -lcudart"
        fi
      fi # end of pkg-config decision tree
    fi # end of CUDAFLAGS user override protection
    if test "x$CUDAFLAGS" != "x" then
      # somehow CUDAFLAGS was defined
      export CUDAFLAGS
      export CUBLASFLAGS="-DUSE_CUBLAS ${CUDAFLAGS}"
    fi
    if test "x$CUDALDFLAGS" != "x" then
      # somehow CUDALDFLAGS was defined
      export CUDALDFLAGS
      export CUBLASLDFLAGS="${CUDALDFLAGS}"
    fi
  ]
)

m4_define(
  [M4_DETECT_MAGMA],
  [
+31 −5
Original line number Diff line number Diff line
@@ -24,17 +24,26 @@
#include <fstream>
#include <hdf5.h>
#include <string>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef USE_MPI
#ifndef MPI_VERSION
#include <mpi.h>
#endif
#endif

#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>
#endif

#define USE_CUBLAS 1
#ifdef USE_CUBLAS
#include <cuda_runtime.h>
#endif

#ifdef USE_MAGMA
#include <cuda_runtime.h>
#endif
@@ -117,10 +126,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
  Logger *logger = new Logger(LOG_DEBG);
  int device_count = 0;

#ifdef USE_CUBLAS
  cudaGetDeviceCount(&device_count);
  logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG);
#elif defined USE_MAGMA
  //===========
  // Initialise MAGMA
  //===========
#ifdef USE_MAGMA
  // GMu note: MAGMA does not necessarily rely on CUDA, it may just as well run on openCL or HIP, we should consider alternative ways to detect the number of devices if MAGMA is not using CUDA but something else
  cudaGetDeviceCount(&device_count);
  logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG);
@@ -136,8 +148,8 @@ void cluster(const string& config_file, const string& data_file, const string& o
    delete logger;
    return;
  }
#endif
  // end MAGMA initialisation
#endif

  //===========================
  // the following only happens on MPI process 0
@@ -279,7 +291,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
      // Create empty virtual binary file
      VirtualBinaryFile *vtppoanp = new VirtualBinaryFile();
      string tppoan_name = output_path + "/c_TPPOAN";
#ifdef USE_MAGMA
#ifdef USE_CUBLAS
      logger->log("INFO: using CUBLAS calls.\n", LOG_INFO);
#elif defined USE_MAGMA
      logger->log("INFO: using MAGMA calls.\n", LOG_INFO);
#elif defined USE_LAPACK
      logger->log("INFO: using LAPACK calls.\n", LOG_INFO);
@@ -549,7 +563,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
      
    delete sconf;
    delete gconf;
#ifdef USE_MAGMA
#ifdef USE_CUBLAS
    // just a placeholder to skip magma finalisation if we are using cublas
#elif defined USE_MAGMA
    logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
    magma_finalize();
#endif
@@ -672,7 +688,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
    delete sconf;
    delete gconf;
#endif
#ifdef USE_MAGMA
#ifdef USE_CUBLAS
    // placeholder to avoid magma if using cublas
#elif defined USE_MAGMA
    logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
    magma_finalize();
#endif
@@ -790,6 +808,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  string outam0_name = output_path + "/c_AM0_JXI" + to_string(jxi488) + ".txt";
  sprintf(virtual_line, " AM matrix before CMS\n");
  outam0->append_line(virtual_line);
  sprintf(virtual_line, " %d\n", ndit);
  outam0->append_line(virtual_line);  
  sprintf(virtual_line, " I1+1   I2+1    Real    Imag\n");
  outam0->append_line(virtual_line);
  write_dcomplex_matrix(outam0, cid->am, ndit, ndit);
@@ -802,6 +822,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
  sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
  outam1->append_line(virtual_line);
  sprintf(virtual_line, " %d\n", ndit);
  outam1->append_line(virtual_line);  
  sprintf(virtual_line, " I1+1   I2+1    Real    Imag\n");
  outam1->append_line(virtual_line);
  write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
@@ -838,6 +860,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  string outam2_name = output_path + "/c_AM2_JXI" + to_string(jxi488) + ".txt";
  sprintf(virtual_line, " AM matrix after LUCIN before ZTM\n");
  outam2->append_line(virtual_line);
  sprintf(virtual_line, " %d\n", ndit);
  outam2->append_line(virtual_line);  
  sprintf(virtual_line, " I1+1   I2+1    Real    Imag\n");
  outam2->append_line(virtual_line);
  write_dcomplex_matrix(outam2, cid->am, ndit, ndit);
@@ -867,6 +891,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  string outam3_name = output_path + "/c_AM3_JXI" + to_string(jxi488) + ".txt";
  sprintf(virtual_line, " AM matrix after ZTM\n");
  outam3->append_line(virtual_line);
  sprintf(virtual_line, " %d\n", ndit);
  outam3->append_line(virtual_line);  
  sprintf(virtual_line, " I1+1   I2+1    Real    Imag\n");
  outam3->append_line(virtual_line);
  write_dcomplex_matrix(outam3, cid->am, ndit, ndit);
+50 −0
Original line number Diff line number Diff line
/* Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.h
 *
 * \brief C++ interface to CUBLAS calls.
 *
 */

#ifndef INCLUDE_CUBLAS_CALLS_H_
#define INCLUDE_CUBLAS_CALLS_H_

/*! \brief Invert a complex matrix with double precision elements.
 *
 * Use CUBLAS to perform an in-place matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution
 *
 * Use CUBLAS to perform matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param maxrefiters: `int` Maximum number of refinement iterations to apply.
 * \param accuracygoal: `double` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);

#endif
+17 −1
Original line number Diff line number Diff line
@@ -38,6 +38,16 @@
#endif
#endif

// define by hand for a first test
#define USE_CUBLAS 1
#ifdef USE_CUBLAS
// define by hand for a first test
//#define USE_REFINEMENT 1
#ifndef INCLUDE_CUBLAS_CALLS_H_
#include "../include/cublas_calls.h"
#endif
#endif

#ifndef INCLUDE_ALGEBRAIC_H_
#include "../include/algebraic.h"
#endif
@@ -50,7 +60,13 @@ using namespace std;

void invert_matrix(dcomplex **mat, np_int size, int &ier, int &maxrefiters, double &accuracygoal, int refinemode, np_int max_size, int target_device) {
  ier = 0;
#ifdef USE_MAGMA
#ifdef USE_CUBLAS
#ifdef USE_REFINEMENT
  cublas_zinvert_and_refine(mat, size, maxrefiters, accuracygoal, refinemode, target_device);
#else
  cublas_zinvert(mat, size, target_device);
#endif
#elif defined USE_MAGMA
#ifdef USE_REFINEMENT
  // try using the iterative refinement to obtain a more accurate solution
  // we pass to magma_zinvert_and_refine() the accuracygoal in, get the actual
+232 −0
Original line number Diff line number Diff line
/* Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.cpp
 *
 * \brief Implementation of the interface with CUBLAS libraries.
 */
#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif

#define USE_CUBLAS 1
#ifdef USE_CUBLAS

#ifndef INCLUDE_CUBLAS_CALLS_H_
#include "../include/cublas_calls.h"
#endif

#include <limits>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#ifdef USE_ILP64
#define CUZGEMM cublasZgemm_64
#define CUZAXPY cublasZaxpy_64
#define CUIZAMAX cublasIzamax_64
#else
#define CUZGEMM cublasZgemm
#define CUZAXPY cublasZaxpy
#define CUIZAMAX cublasIzamax
#endif

#define cudacall(call)							\
  do									\
    {									\
      cudaError_t err = (call);						\
      if(cudaSuccess != err)						\
        {								\
	  fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
	  cudaDeviceReset();						\
	  exit(EXIT_FAILURE);						\
        }								\
    }									\
  while (0)

#define cublascall(call)						\
  do									\
    {									\
      cublasStatus_t status = (call);					\
      if(CUBLAS_STATUS_SUCCESS != status)				\
        {								\
	  fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
	  cudaDeviceReset();						\
	  exit(EXIT_FAILURE);						\
        }								\
      									\
    }									\
  while(0)

void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
  cudacall(cudaSetDevice(device_id));
  cublasHandle_t handle;
  cublascall(cublasCreate_v2(&handle));
  int batchsize = 1;
  int *piv, *info; // array of pivot indices
  np_int m = (np_int) n; // changed rows; a - mxm matrix
  np_int mm = m * m; // size of a
  cudacall(cudaMalloc<int>(&piv, m*batchsize*sizeof(int)));
  cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
  cuDoubleComplex *d_a; // pointer to first element on device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_a,m*m*sizeof(cuDoubleComplex)));
  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
  cuDoubleComplex **batch_d_a;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_c,m*m*sizeof(cuDoubleComplex)));
  cuDoubleComplex **batch_d_c;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  cudaFree(batch_d_a);
  cudaFree(batch_d_c);
  cudaFree(piv);
  cudaFree(info);
  cudaFree(d_a);
  cudaFree(d_c);
  cublasDestroy_v2(handle);

}

void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
  cudacall(cudaSetDevice(device_id));
  cublasHandle_t handle;
  cublascall(cublasCreate_v2(&handle));
  int batchsize = 1;
  int *piv, *info; // array of pivot indices
  np_int m = (np_int) n; // changed rows; a - mxm matrix
  np_int mm = m * m; // size of a
  cudacall(cudaMalloc<int>(&piv, m*batchsize*sizeof(int)));
  cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
  cuDoubleComplex *d_a; // pointer to first element on device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_a,m*m*sizeof(cuDoubleComplex)));
  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
  cuDoubleComplex **batch_d_a;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_c,m*m*sizeof(cuDoubleComplex)));
  cuDoubleComplex **batch_d_c;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
  //cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  cudaFree(batch_d_a);
  cudaFree(batch_d_c);
  cudaFree(piv);
  cudaFree(info);
  cuDoubleComplex *d_a_residual;
  cuDoubleComplex *d_a_refine;
  cuDoubleComplex *d_id;
  if (maxiters>0) {
    // copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
    cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use  
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, m*m*sizeof(cuDoubleComplex)));
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, m*m*sizeof(cuDoubleComplex)));
    // allocate memory for the temporary matrix products
    dcomplex *native_id = new dcomplex[m];
    for (np_int i=0; i<m; i++) native_id[i] = 1;
    cuDoubleComplex *m_id = (cuDoubleComplex *) &(native_id[0]);
    // fill it with 1
    cudacall(cudaMalloc<cuDoubleComplex>(&d_id, m*sizeof(cuDoubleComplex)));
    cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
    delete[] native_id; // free identity vector on host
  }
  bool iteraterefine = true;
  if (maxiters>0) {
    cuDoubleComplex cu_mone;
    (((double *) &(cu_mone))[0]) = -1;
    (((double *) &(cu_mone))[1]) = 0;
    cuDoubleComplex cu_one;
    (((double *) &(cu_one))[0]) = 1;
    (((double *) &(cu_one))[1]) = 0;
    cuDoubleComplex cu_zero;
    (((double *) &(cu_zero))[0]) = 0;
    (((double *) &(cu_zero))[1]) = 0;
    // multiply minus the original matrix times the inverse matrix
    // NOTE: factors in zgemm are swapped because zgemm is designed for column-major
    // Fortran-style arrays, whereas our arrays are C-style row-major.
    cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
    // add the identity to the product
    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
    double oldmax=0;
    if (refinemode >0) {
      np_int maxindex;
      // find the maximum absolute value of the residual
      cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
      cuDoubleComplex cublasmax;
      // transfer the maximum value to the host
      cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
      // take the module
      oldmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
      printf("Initial max residue = %g\n", oldmax);
      if (oldmax < accuracygoal) iteraterefine = false;
    }
    // begin correction loop (should iterate maxiters times)
    int iter;
    for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
      // multiply the inverse times the residual, add to the initial inverse
      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
      // add to the initial inverse
      cublascall(CUZAXPY(handle, mm, &cu_one, d_a_refine, 1, d_c, 1));
      // multiply minus the original matrix times the new inverse matrix
      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
      // add the identity to the product
      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	// find the maximum absolute value of the residual
	np_int maxindex;
	cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
	// transfer the maximum value to the host
	cuDoubleComplex cublasmax;
	cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
	// take the module
	double newmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
	printf("Max residue after %d iterations = %g\n", iter+1, newmax);
	// if the maximum in the residual decreased from the previous iteration,
	// update oldmax and go on, otherwise no point further iterating refinements
	if ((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false;
	oldmax = newmax;
      }
    }
    // if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
    if (refinemode==2) maxiters = iter;
    accuracygoal = oldmax;
    // end correction loop
  }
  // copy the final refined inverted matrix back from device to host
  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  // free temporary device arrays 
  cudaFree(d_a);
  cudaFree(d_c);
  if (maxiters>0) {
    cudaFree(d_id);
    cudaFree(d_a_refine);
    cudaFree(d_a_residual);
  }

  cublasDestroy_v2(handle);

}


#endif
Loading