Implement cuBLAS based refined inversion (a47e3dbc) · Commits · Giacomo Mulas / NP_TMcode

src/include/cublas_calls.h

+9 −18

Original line number	Diff line number	Diff line
		@@ -14,7 +14,8 @@
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file cublas_calls.h
		/**
		* \file cublas_calls.h
		*
		* \brief C++ interface to CUBLAS calls.
		*
		@@ -23,29 +24,19 @@
		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#define INCLUDE_CUBLAS_CALLS_H_

		/*! \brief Invert a complex matrix with double precision elements.
		/**
		* \brief Invert a complex matrix with double precision elements.
		*
		* Use CUBLAS to perform an in-place matrix inversion for a complex
		* Use cuBLAS to perform an in-place matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param device_id: `int` ID of the device for matrix inversion offloading.
		* \param rs: `const RuntimeSettings&` Runtime settings instance.
		*/
		void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

		/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution
		*
		* Use CUBLAS to perform matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param maxrefiters: `int` Maximum number of refinement iterations to apply.
		* \param accuracygoal: `double &` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
		* \param refinemode: `int` Flag to choose the refinement mode.
		* \param device_id: `int` ID of the device for matrix inversion offloading.
		*/
		void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);
		void cublas_zinvert(
		dcomplex **mat, np_int n, int device_id, const RuntimeSettings& rs=RuntimeSettings()
		);

		#endif

src/libnptm/algebraic.cpp

+1 −5

Original line number	Diff line number	Diff line
		@@ -76,11 +76,7 @@ void invert_matrix(
		// #endif
		magma_zinvert(mat, size, ier, target_device, rs);
		#elif defined USE_CUBLAS
		#ifdef USE_REFINEMENT
		cublas_zinvert_and_refine(mat, size, maxrefiters, accuracygoal, refinemode, target_device);
		#else
		cublas_zinvert(mat, size, target_device);
		#endif
		cublas_zinvert(mat, size, target_device, rs);
		#elif defined USE_LAPACK
		zinvert(mat, size, ier, rs);
		#else

src/libnptm/cublas_calls.cpp

+129 −147

Original line number	Diff line number	Diff line
		@@ -14,25 +14,36 @@
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file cublas_calls.cpp
		/**
		* \file cublas_calls.cpp
		*
		* \brief Implementation of the interface with CUBLAS libraries.
		*/

		#ifdef USE_CUBLAS

		#include <cublas_v2.h>
		#include <cuda_runtime.h>
		#include <cstdio>
		#include <limits>
		#include <string>

		#ifndef INCLUDE_TYPES_H_
		#include "../include/types.h"
		#endif

		//#define USE_CUBLAS 1
		#ifdef USE_CUBLAS
		#ifndef INCLUDE_LOGGING_H_
		#include "../include/logging.h"
		#endif

		#ifndef INCLUDE_CONFIGURATION_H_
		#include "../include/Configuration.h"
		#endif

		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#include "../include/cublas_calls.h"
		#endif

		#include <limits>
		#include <cuda_runtime.h>
		#include <cublas_v2.h>

		#ifdef USE_ILP64
		#define CUZGEMM cublasZgemm_64
		#define CUZAXPY cublasZaxpy_64
		@@ -70,11 +81,16 @@
		} \
		while(0)

		void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
		using namespace std;

		void cublas_zinvert(dcomplex **mat, np_int n, int device_id, const RuntimeSettings& rs) {
		char buffer[128];
		string message;
		cudacall(cudaSetDevice(device_id));
		cublasHandle_t handle;
		cublascall(cublasCreate_v2(&handle));
		int batchsize = 1;
		if (rs.invert_mode == RuntimeSettings::INV_MODE_LU) {
		int piv, info; // array of pivot indices
		np_int m = (np_int) n; // changed rows; a - mxm matrix
		np_int mm = m * m; // size of a
		@@ -83,61 +99,31 @@ void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
		cuDoubleComplex a = (cuDoubleComplex )&(mat[0][0]); // pointer to first element on host
		cuDoubleComplex *d_a; // pointer to first element on device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a,mmsizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy a -> d_a
		cuDoubleComplex **batch_d_a;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_a,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_a, &d_a, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize)); // call to ZGETRF
		cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_c,mmsizeof(cuDoubleComplex)));
		cuDoubleComplex **batch_d_c;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_c,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_c, &d_c, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
		cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cudaFree(d_a);
		cudaFree(d_c);
		cublasDestroy_v2(handle);

		}
		if (rs.use_refinement) {
		cuDoubleComplex cu_mone;
		(((double *) &(cu_mone))[0]) = -1;
		(((double *) &(cu_mone))[1]) = 0;
		cuDoubleComplex cu_one;
		(((double *) &(cu_one))[0]) = 1;
		(((double *) &(cu_one))[1]) = 0;
		cuDoubleComplex cu_zero;
		(((double *) &(cu_zero))[0]) = 0;
		(((double *) &(cu_zero))[1]) = 0;

		void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
		cudacall(cudaSetDevice(device_id));
		cublasHandle_t handle;
		cublascall(cublasCreate_v2(&handle));
		int batchsize = 1;
		int piv, info; // array of pivot indices
		np_int m = (np_int) n; // changed rows; a - mxm matrix
		np_int mm = m * m; // size of a
		cudacall(cudaMalloc<int>(&piv, mbatchsizesizeof(int)));
		cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
		cuDoubleComplex a = (cuDoubleComplex )&(mat[0][0]); // pointer to first element on host
		cuDoubleComplex *d_a; // pointer to first element on device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a,mmsizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
		cuDoubleComplex **batch_d_a;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_a,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_a, &d_a, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
		cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_c,mmsizeof(cuDoubleComplex)));
		cuDoubleComplex **batch_d_c;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_c,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_c, &d_c, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
		//cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cuDoubleComplex *d_a_residual;
		cuDoubleComplex *d_a_refine;
		cuDoubleComplex *d_id;
		if (maxiters>0) {
		// copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, mmsizeof(cuDoubleComplex)));
		@@ -150,18 +136,6 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
		delete[] native_id; // free identity vector on host
		}
		bool iteraterefine = true;
		if (maxiters>0) {
		cuDoubleComplex cu_mone;
		(((double *) &(cu_mone))[0]) = -1;
		(((double *) &(cu_mone))[1]) = 0;
		cuDoubleComplex cu_one;
		(((double *) &(cu_one))[0]) = 1;
		(((double *) &(cu_one))[1]) = 0;
		cuDoubleComplex cu_zero;
		(((double *) &(cu_zero))[0]) = 0;
		(((double *) &(cu_zero))[1]) = 0;
		// multiply minus the original matrix times the inverse matrix
		// NOTE: factors in zgemm are swapped because zgemm is designed for column-major
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		@@ -169,7 +143,6 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
		double oldmax=0;
		if (refinemode >0) {
		np_int maxindex;
		// find the maximum absolute value of the residual
		cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
		@@ -178,12 +151,12 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
		// take the module
		oldmax = cabs( (((double ) &(cublasmax))[0]) + I(((double *) &(cublasmax))[1]));
		printf("Initial max residue = %g\n", oldmax);
		if (oldmax < accuracygoal) iteraterefine = false;
		}
		// begin correction loop (should iterate maxiters times)
		int iter;
		for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
		//printf("Initial max residue = %g\n", oldmax);
		sprintf(buffer, "INFO: Initial max residue is %.5le.\n", oldmax);
		message = buffer;
		rs.logger->log(message);
		if (oldmax > rs.accuracy_goal) {
		for (int iter = 0; iter < rs.max_ref_iters; iter++) {
		// multiply the inverse times the residual, add to the initial inverse
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
		// add to the initial inverse
		@@ -192,7 +165,6 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		// find the maximum absolute value of the residual
		np_int maxindex;
		cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
		@@ -201,32 +173,42 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
		// take the module
		double newmax = cabs( (((double ) &(cublasmax))[0]) + I(((double *) &(cublasmax))[1]));
		printf("Max residue after %d iterations = %g\n", iter+1, newmax);
		sprintf(buffer, "DEBUG: Maximum residue after %d iterations = %.5le\n", iter+1, newmax);
		message = buffer;
		rs.logger->log(message);
		// if the maximum in the residual decreased from the previous iteration,
		// update oldmax and go on, otherwise no point further iterating refinements
		if ((refinemode==2) && ((newmax > oldmax)\|\|(newmax < accuracygoal))) iteraterefine = false;
		// if ((refinemode==2) && ((newmax > oldmax)\|\|(newmax < accuracygoal))) iteraterefine = false;
		if (newmax < rs.accuracy_goal) {
		message = "INFO: optimal convergence achieved. Stopping.\n";
		rs.logger->log(message);
		break; // iter for
		}
		if (newmax > 0.99 * oldmax) {
		message = "WARN: cannot improve further. Stopping.\n";
		rs.logger->log(message, LOG_WARN);
		break; // iter for
		}
		oldmax = newmax;
		}
		} else {
		message = "INFO: starting accuracy is already good.\n";
		rs.logger->log(message);
		}
		// if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
		if (refinemode==2) maxiters = iter;
		accuracygoal = oldmax;
		// end correction loop
		}
		// copy the final refined inverted matrix back from device to host
		cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		// free temporary device arrays
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cudaFree(d_a);
		cudaFree(d_c);
		if (maxiters>0) {
		cudaFree(d_id);
		cudaFree(d_a_refine);
		cudaFree(d_a_residual);
		} else {
		message = "ERROR: cuBLAS solver only implements LU inversion!\n";
		rs.logger->err(message);
		exit(1);
		}

		cublasDestroy_v2(handle);

		}


		#endif