Commit a47e3dbc authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Implement cuBLAS based refined inversion

parent d7fa5caf
Loading
Loading
Loading
Loading
+9 −18
Original line number Diff line number Diff line
@@ -14,7 +14,8 @@
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.h
/**
 * \file cublas_calls.h
 *
 * \brief C++ interface to CUBLAS calls.
 *
@@ -23,29 +24,19 @@
#ifndef INCLUDE_CUBLAS_CALLS_H_
#define INCLUDE_CUBLAS_CALLS_H_

/*! \brief Invert a complex matrix with double precision elements.
/**
 * \brief Invert a complex matrix with double precision elements.
 *
 * Use CUBLAS to perform an in-place matrix inversion for a complex
 * Use cuBLAS to perform an in-place matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 * \param rs: `const RuntimeSettings&` Runtime settings instance.
 */
void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution
 *
 * Use CUBLAS to perform matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param maxrefiters: `int` Maximum number of refinement iterations to apply.
 * \param accuracygoal: `double &` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
 * \param refinemode: `int` Flag to choose the refinement mode.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);
void cublas_zinvert(
  dcomplex **mat, np_int n, int device_id, const RuntimeSettings& rs=RuntimeSettings()
);

#endif
+1 −5
Original line number Diff line number Diff line
@@ -76,11 +76,7 @@ void invert_matrix(
// #endif
  magma_zinvert(mat, size, ier, target_device, rs);
#elif defined USE_CUBLAS
#ifdef USE_REFINEMENT
  cublas_zinvert_and_refine(mat, size, maxrefiters, accuracygoal, refinemode, target_device);
#else
  cublas_zinvert(mat, size, target_device);
#endif
  cublas_zinvert(mat, size, target_device, rs);
#elif defined USE_LAPACK
  zinvert(mat, size, ier, rs);
#else
+129 −147
Original line number Diff line number Diff line
@@ -14,25 +14,36 @@
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.cpp
/**
 * \file cublas_calls.cpp
 *
 * \brief Implementation of the interface with CUBLAS libraries.
 */

#ifdef USE_CUBLAS

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <limits>
#include <string>

#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif

//#define USE_CUBLAS 1
#ifdef USE_CUBLAS
#ifndef INCLUDE_LOGGING_H_
#include "../include/logging.h"
#endif

#ifndef INCLUDE_CONFIGURATION_H_
#include "../include/Configuration.h"
#endif

#ifndef INCLUDE_CUBLAS_CALLS_H_
#include "../include/cublas_calls.h"
#endif

#include <limits>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#ifdef USE_ILP64
#define CUZGEMM cublasZgemm_64
#define CUZAXPY cublasZaxpy_64
@@ -70,11 +81,16 @@
    }									\
  while(0)

void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
using namespace std;

void cublas_zinvert(dcomplex **mat, np_int n, int device_id, const RuntimeSettings& rs) {
  char buffer[128];
  string message;
  cudacall(cudaSetDevice(device_id));
  cublasHandle_t handle;
  cublascall(cublasCreate_v2(&handle));
  int batchsize = 1;
  if (rs.invert_mode == RuntimeSettings::INV_MODE_LU) {
    int *piv, *info; // array of pivot indices
    np_int m = (np_int) n; // changed rows; a - mxm matrix
    np_int mm = m * m; // size of a
@@ -83,61 +99,31 @@ void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
    cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
    cuDoubleComplex *d_a; // pointer to first element on device
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a,m*m*sizeof(cuDoubleComplex)));
  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
    cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy a -> d_a
    cuDoubleComplex **batch_d_a;
    cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
    cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
    cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize)); // call to ZGETRF
    cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
    cudacall(cudaMalloc<cuDoubleComplex>(&d_c,m*m*sizeof(cuDoubleComplex)));
    cuDoubleComplex **batch_d_c;
    cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
    cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
    cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  cudaFree(batch_d_a);
  cudaFree(batch_d_c);
  cudaFree(piv);
  cudaFree(info);
  cudaFree(d_a);
  cudaFree(d_c);
  cublasDestroy_v2(handle);

}
    if (rs.use_refinement) {
      cuDoubleComplex cu_mone;
      (((double *) &(cu_mone))[0]) = -1;
      (((double *) &(cu_mone))[1]) = 0;
      cuDoubleComplex cu_one;
      (((double *) &(cu_one))[0]) = 1;
      (((double *) &(cu_one))[1]) = 0;
      cuDoubleComplex cu_zero;
      (((double *) &(cu_zero))[0]) = 0;
      (((double *) &(cu_zero))[1]) = 0;

void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
  cudacall(cudaSetDevice(device_id));
  cublasHandle_t handle;
  cublascall(cublasCreate_v2(&handle));
  int batchsize = 1;
  int *piv, *info; // array of pivot indices
  np_int m = (np_int) n; // changed rows; a - mxm matrix
  np_int mm = m * m; // size of a
  cudacall(cudaMalloc<int>(&piv, m*batchsize*sizeof(int)));
  cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
  cuDoubleComplex *d_a; // pointer to first element on device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_a,m*m*sizeof(cuDoubleComplex)));
  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
  cuDoubleComplex **batch_d_a;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_c,m*m*sizeof(cuDoubleComplex)));
  cuDoubleComplex **batch_d_c;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
  //cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  cudaFree(batch_d_a);
  cudaFree(batch_d_c);
  cudaFree(piv);
  cudaFree(info);
      cuDoubleComplex *d_a_residual;
      cuDoubleComplex *d_a_refine;
      cuDoubleComplex *d_id;
  if (maxiters>0) {
      // copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
      cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use  
      cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, m*m*sizeof(cuDoubleComplex)));
@@ -150,18 +136,6 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
      cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
      cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
      delete[] native_id; // free identity vector on host
  }
  bool iteraterefine = true;
  if (maxiters>0) {
    cuDoubleComplex cu_mone;
    (((double *) &(cu_mone))[0]) = -1;
    (((double *) &(cu_mone))[1]) = 0;
    cuDoubleComplex cu_one;
    (((double *) &(cu_one))[0]) = 1;
    (((double *) &(cu_one))[1]) = 0;
    cuDoubleComplex cu_zero;
    (((double *) &(cu_zero))[0]) = 0;
    (((double *) &(cu_zero))[1]) = 0;
      // multiply minus the original matrix times the inverse matrix
      // NOTE: factors in zgemm are swapped because zgemm is designed for column-major
      // Fortran-style arrays, whereas our arrays are C-style row-major.
@@ -169,7 +143,6 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
      // add the identity to the product
      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
      double oldmax=0;
    if (refinemode >0) {
      np_int maxindex;
      // find the maximum absolute value of the residual
      cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
@@ -178,12 +151,12 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
      cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
      // take the module
      oldmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
      printf("Initial max residue = %g\n", oldmax);
      if (oldmax < accuracygoal) iteraterefine = false;
    }
    // begin correction loop (should iterate maxiters times)
    int iter;
    for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
      //printf("Initial max residue = %g\n", oldmax);
      sprintf(buffer, "INFO: Initial max residue is %.5le.\n", oldmax);
      message = buffer;
      rs.logger->log(message);
      if (oldmax > rs.accuracy_goal) {
	for (int iter = 0; iter < rs.max_ref_iters; iter++) {
	  // multiply the inverse times the residual, add to the initial inverse
	  cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
	  // add to the initial inverse
@@ -192,7 +165,6 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
	  cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
	  // add the identity to the product
	  cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	  // find the maximum absolute value of the residual
	  np_int maxindex;
	  cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
@@ -201,32 +173,42 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
	  cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
	  // take the module
	  double newmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
	printf("Max residue after %d iterations = %g\n", iter+1, newmax);
	  sprintf(buffer, "DEBUG: Maximum residue after %d iterations = %.5le\n", iter+1, newmax);
	  message = buffer;
	  rs.logger->log(message);
	  // if the maximum in the residual decreased from the previous iteration,
	  // update oldmax and go on, otherwise no point further iterating refinements
	if ((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false;
	  // if ((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false;
	  if (newmax < rs.accuracy_goal) {
	    message = "INFO: optimal convergence achieved. Stopping.\n";
	    rs.logger->log(message);
	    break; // iter for
	  }
	  if (newmax > 0.99 * oldmax) {
	    message = "WARN: cannot improve further. Stopping.\n";
	    rs.logger->log(message, LOG_WARN);
	    break; // iter for
	  }
	  oldmax = newmax;
	}
      } else {
	message = "INFO: starting accuracy is already good.\n";
	rs.logger->log(message);
      }
    // if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
    if (refinemode==2) maxiters = iter;
    accuracygoal = oldmax;
    // end correction loop
    }
  // copy the final refined inverted matrix back from device to host
    cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  // free temporary device arrays 
    cudaFree(batch_d_a);
    cudaFree(batch_d_c);
    cudaFree(piv);
    cudaFree(info);
    cudaFree(d_a);
    cudaFree(d_c);
  if (maxiters>0) {
    cudaFree(d_id);
    cudaFree(d_a_refine);
    cudaFree(d_a_residual);
  } else {
    message = "ERROR: cuBLAS solver only implements LU inversion!\n";
    rs.logger->err(message);
    exit(1);
  }

  cublasDestroy_v2(handle);

}


#endif