first implementation of CUBLAS-based matrix inversion (50efe30f) · Commits · Giacomo Mulas / NP_TMcode

build/configure.ac

+63 −0

Original line number	Diff line number	Diff line
		@@ -163,6 +163,69 @@ m4_define(
		]
		)

		m4_define(
		[M4_DETECT_CUBLAS],
		[
		pkg-config --version > /dev/null
		use_pkg_config=$?
		if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then
		if test "x$use_pkg_config" = "x0"; then
		# pkg-config is available
		declare -a pkg_array=$(pkg-config --list-all \| grep cublas)
		for i in "${pkg_array[[@]]}"; do echo "$i" \| cut --delimiter=" " -f1; done \| grep cublas > /dev/null
		result=$?
		if test "x$result" = "x0"; then
		# CUBLAS detected
		cuda_pkg=$(for i in "${pkg_array[[@]]}"; do echo "$i" \| cut --delimiter=" " -f1; done \| grep cublas)
		CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg})
		CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg})
		fi # end of CUBLAS runtime decision tree
		declare -a pkg_array=$(pkg-config --list-all \| grep cudart)
		for i in "${pkg_array[[@]]}"; do echo "$i" \| cut --delimiter=" " -f1; done \| grep cudart > /dev/null
		result=$?
		if test "x$result" = "x0"; then
		# CUDA runtime detected
		cuda_pkg=$(for i in "${pkg_array[[@]]}"; do echo "$i" \| cut --delimiter=" " -f1; done \| grep cudart)
		CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg})
		CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg})
		fi # end of CUDA runtime decision tree
		echo $CUDALDFLAGS \| grep cublas > /dev/null
		cudart_check=$?
		if test "x${cudart_check}" != "x0"; then
		CUDALDFLAGS="$CUDALDFLAGS -lcublas"
		fi
		echo $CUDALDFLAGS \| grep cudart > /dev/null
		cudart_check=$?
		if test "x${cudart_check}" != "x0"; then
		CUDALDFLAGS="$CUDALDFLAGS -lcudart"
		fi
		else
		# pkg-config is not available
		if test -f /usr/local/cuda/include/cuda.h; then
		CUDAFLAGS="-I/usr/local/cuda/include"
		CUDALDFLAGS="-L/usr/local/cuda/lib64 -lcublas -lcudart"
		elif test -f /usr/include/cuda.h; then
		CUDAFLAGS="-I/usr/include"
		CUDALDFLAGS="-lcublas -lcudart"
		elif test "x$CUDA_HOME" != "x"; then
		CUDAFLAGS="-I${CUDA_HOME}/include"
		CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcublas -lcudart"
		fi
		fi # end of pkg-config decision tree
		fi # end of CUDAFLAGS user override protection
		if test "x$CUDAFLAGS" != "x" then
		# somehow CUDAFLAGS was defined
		export CUDAFLAGS
		export CUBLASFLAGS="-DUSE_CUBLAS ${CUDAFLAGS}"
		fi
		if test "x$CUDALDFLAGS" != "x" then
		# somehow CUDALDFLAGS was defined
		export CUDALDFLAGS
		export CUBLASLDFLAGS="${CUDALDFLAGS}"
		fi
		]
		)

		m4_define(
		[M4_DETECT_MAGMA],
		[

src/cluster/cluster.cpp

+31 −5

Original line number	Diff line number	Diff line
		@@ -24,17 +24,26 @@
		#include <fstream>
		#include <hdf5.h>
		#include <string>

		#ifdef _OPENMP
		#include <omp.h>
		#endif

		#ifdef USE_MPI
		#ifndef MPI_VERSION
		#include <mpi.h>
		#endif
		#endif

		#ifdef USE_NVTX
		#include <nvtx3/nvToolsExt.h>
		#endif

		#define USE_CUBLAS 1
		#ifdef USE_CUBLAS
		#include <cuda_runtime.h>
		#endif

		#ifdef USE_MAGMA
		#include <cuda_runtime.h>
		#endif
		@@ -117,10 +126,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
		Logger *logger = new Logger(LOG_DEBG);
		int device_count = 0;

		#ifdef USE_CUBLAS
		cudaGetDeviceCount(&device_count);
		logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG);
		#elif defined USE_MAGMA
		//===========
		// Initialise MAGMA
		//===========
		#ifdef USE_MAGMA
		// GMu note: MAGMA does not necessarily rely on CUDA, it may just as well run on openCL or HIP, we should consider alternative ways to detect the number of devices if MAGMA is not using CUDA but something else
		cudaGetDeviceCount(&device_count);
		logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG);
		@@ -136,8 +148,8 @@ void cluster(const string& config_file, const string& data_file, const string& o
		delete logger;
		return;
		}
		#endif
		// end MAGMA initialisation
		#endif

		//===========================
		// the following only happens on MPI process 0
		@@ -279,7 +291,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
		// Create empty virtual binary file
		VirtualBinaryFile *vtppoanp = new VirtualBinaryFile();
		string tppoan_name = output_path + "/c_TPPOAN";
		#ifdef USE_MAGMA
		#ifdef USE_CUBLAS
		logger->log("INFO: using CUBLAS calls.\n", LOG_INFO);
		#elif defined USE_MAGMA
		logger->log("INFO: using MAGMA calls.\n", LOG_INFO);
		#elif defined USE_LAPACK
		logger->log("INFO: using LAPACK calls.\n", LOG_INFO);
		@@ -549,7 +563,9 @@ void cluster(const string& config_file, const string& data_file, const string& o

		delete sconf;
		delete gconf;
		#ifdef USE_MAGMA
		#ifdef USE_CUBLAS
		// just a placeholder to skip magma finalisation if we are using cublas
		#elif defined USE_MAGMA
		logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
		magma_finalize();
		#endif
		@@ -672,7 +688,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
		delete sconf;
		delete gconf;
		#endif
		#ifdef USE_MAGMA
		#ifdef USE_CUBLAS
		// placeholder to avoid magma if using cublas
		#elif defined USE_MAGMA
		logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
		magma_finalize();
		#endif
		@@ -790,6 +808,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
		string outam0_name = output_path + "/c_AM0_JXI" + to_string(jxi488) + ".txt";
		sprintf(virtual_line, " AM matrix before CMS\n");
		outam0->append_line(virtual_line);
		sprintf(virtual_line, " %d\n", ndit);
		outam0->append_line(virtual_line);
		sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
		outam0->append_line(virtual_line);
		write_dcomplex_matrix(outam0, cid->am, ndit, ndit);
		@@ -802,6 +822,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
		string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
		sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
		outam1->append_line(virtual_line);
		sprintf(virtual_line, " %d\n", ndit);
		outam1->append_line(virtual_line);
		sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
		outam1->append_line(virtual_line);
		write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
		@@ -838,6 +860,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
		string outam2_name = output_path + "/c_AM2_JXI" + to_string(jxi488) + ".txt";
		sprintf(virtual_line, " AM matrix after LUCIN before ZTM\n");
		outam2->append_line(virtual_line);
		sprintf(virtual_line, " %d\n", ndit);
		outam2->append_line(virtual_line);
		sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
		outam2->append_line(virtual_line);
		write_dcomplex_matrix(outam2, cid->am, ndit, ndit);
		@@ -867,6 +891,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
		string outam3_name = output_path + "/c_AM3_JXI" + to_string(jxi488) + ".txt";
		sprintf(virtual_line, " AM matrix after ZTM\n");
		outam3->append_line(virtual_line);
		sprintf(virtual_line, " %d\n", ndit);
		outam3->append_line(virtual_line);
		sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
		outam3->append_line(virtual_line);
		write_dcomplex_matrix(outam3, cid->am, ndit, ndit);

src/include/cublas_calls.h

0 → 100644

+50 −0

Original line number	Diff line number	Diff line
		/* Copyright (C) 2024 INAF - Osservatorio Astronomico di Cagliari

		This program is free software: you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation, either version 3 of the License, or
		(at your option) any later version.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		GNU General Public License for more details.

		A copy of the GNU General Public License is distributed along with
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file cublas_calls.h
		*
		* \brief C++ interface to CUBLAS calls.
		*
		*/

		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#define INCLUDE_CUBLAS_CALLS_H_

		/*! \brief Invert a complex matrix with double precision elements.
		*
		* Use CUBLAS to perform an in-place matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param device_id: `int` ID of the device for matrix inversion offloading.
		*/
		void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

		/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution
		*
		* Use CUBLAS to perform matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param maxrefiters: `int` Maximum number of refinement iterations to apply.
		* \param accuracygoal: `double` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy
		* \param device_id: `int` ID of the device for matrix inversion offloading.
		*/
		void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);

		#endif

src/libnptm/algebraic.cpp

+17 −1

Original line number	Diff line number	Diff line
		@@ -38,6 +38,16 @@
		#endif
		#endif

		// define by hand for a first test
		#define USE_CUBLAS 1
		#ifdef USE_CUBLAS
		// define by hand for a first test
		//#define USE_REFINEMENT 1
		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#include "../include/cublas_calls.h"
		#endif
		#endif

		#ifndef INCLUDE_ALGEBRAIC_H_
		#include "../include/algebraic.h"
		#endif
		@@ -50,7 +60,13 @@ using namespace std;

		void invert_matrix(dcomplex **mat, np_int size, int &ier, int &maxrefiters, double &accuracygoal, int refinemode, np_int max_size, int target_device) {
		ier = 0;
		#ifdef USE_MAGMA
		#ifdef USE_CUBLAS
		#ifdef USE_REFINEMENT
		cublas_zinvert_and_refine(mat, size, maxrefiters, accuracygoal, refinemode, target_device);
		#else
		cublas_zinvert(mat, size, target_device);
		#endif
		#elif defined USE_MAGMA
		#ifdef USE_REFINEMENT
		// try using the iterative refinement to obtain a more accurate solution
		// we pass to magma_zinvert_and_refine() the accuracygoal in, get the actual

src/libnptm/cublas_calls.cpp

0 → 100644

+232 −0

Original line number	Diff line number	Diff line
		/* Copyright (C) 2024 INAF - Osservatorio Astronomico di Cagliari

		This program is free software: you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation, either version 3 of the License, or
		(at your option) any later version.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		GNU General Public License for more details.

		A copy of the GNU General Public License is distributed along with
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file cublas_calls.cpp
		*
		* \brief Implementation of the interface with CUBLAS libraries.
		*/
		#ifndef INCLUDE_TYPES_H_
		#include "../include/types.h"
		#endif

		#define USE_CUBLAS 1
		#ifdef USE_CUBLAS

		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#include "../include/cublas_calls.h"
		#endif

		#include <limits>
		#include <cuda_runtime.h>
		#include <cublas_v2.h>

		#ifdef USE_ILP64
		#define CUZGEMM cublasZgemm_64
		#define CUZAXPY cublasZaxpy_64
		#define CUIZAMAX cublasIzamax_64
		#else
		#define CUZGEMM cublasZgemm
		#define CUZAXPY cublasZaxpy
		#define CUIZAMAX cublasIzamax
		#endif

		#define cudacall(call) \
		do \
		{ \
		cudaError_t err = (call); \
		if(cudaSuccess != err) \
		{ \
		fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
		cudaDeviceReset(); \
		exit(EXIT_FAILURE); \
		} \
		} \
		while (0)

		#define cublascall(call) \
		do \
		{ \
		cublasStatus_t status = (call); \
		if(CUBLAS_STATUS_SUCCESS != status) \
		{ \
		fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
		cudaDeviceReset(); \
		exit(EXIT_FAILURE); \
		} \
		\
		} \
		while(0)

		void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
		cudacall(cudaSetDevice(device_id));
		cublasHandle_t handle;
		cublascall(cublasCreate_v2(&handle));
		int batchsize = 1;
		int piv, info; // array of pivot indices
		np_int m = (np_int) n; // changed rows; a - mxm matrix
		np_int mm = m * m; // size of a
		cudacall(cudaMalloc<int>(&piv, mbatchsizesizeof(int)));
		cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
		cuDoubleComplex a = (cuDoubleComplex )&(mat[0][0]); // pointer to first element on host
		cuDoubleComplex *d_a; // pointer to first element on device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a,mmsizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
		cuDoubleComplex **batch_d_a;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_a,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_a, &d_a, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
		cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_c,mmsizeof(cuDoubleComplex)));
		cuDoubleComplex **batch_d_c;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_c,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_c, &d_c, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
		cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cudaFree(d_a);
		cudaFree(d_c);
		cublasDestroy_v2(handle);

		}

		void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
		cudacall(cudaSetDevice(device_id));
		cublasHandle_t handle;
		cublascall(cublasCreate_v2(&handle));
		int batchsize = 1;
		int piv, info; // array of pivot indices
		np_int m = (np_int) n; // changed rows; a - mxm matrix
		np_int mm = m * m; // size of a
		cudacall(cudaMalloc<int>(&piv, mbatchsizesizeof(int)));
		cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
		cuDoubleComplex a = (cuDoubleComplex )&(mat[0][0]); // pointer to first element on host
		cuDoubleComplex *d_a; // pointer to first element on device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a,mmsizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
		cuDoubleComplex **batch_d_a;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_a,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_a, &d_a, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
		cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_c,mmsizeof(cuDoubleComplex)));
		cuDoubleComplex **batch_d_c;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_c,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_c, &d_c, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
		//cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cuDoubleComplex *d_a_residual;
		cuDoubleComplex *d_a_refine;
		cuDoubleComplex *d_id;
		if (maxiters>0) {
		// copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, mmsizeof(cuDoubleComplex)));
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, mmsizeof(cuDoubleComplex)));
		// allocate memory for the temporary matrix products
		dcomplex *native_id = new dcomplex[m];
		for (np_int i=0; i<m; i++) native_id[i] = 1;
		cuDoubleComplex m_id = (cuDoubleComplex ) &(native_id[0]);
		// fill it with 1
		cudacall(cudaMalloc<cuDoubleComplex>(&d_id, m*sizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
		delete[] native_id; // free identity vector on host
		}
		bool iteraterefine = true;
		if (maxiters>0) {
		cuDoubleComplex cu_mone;
		(((double *) &(cu_mone))[0]) = -1;
		(((double *) &(cu_mone))[1]) = 0;
		cuDoubleComplex cu_one;
		(((double *) &(cu_one))[0]) = 1;
		(((double *) &(cu_one))[1]) = 0;
		cuDoubleComplex cu_zero;
		(((double *) &(cu_zero))[0]) = 0;
		(((double *) &(cu_zero))[1]) = 0;
		// multiply minus the original matrix times the inverse matrix
		// NOTE: factors in zgemm are swapped because zgemm is designed for column-major
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
		double oldmax=0;
		if (refinemode >0) {
		np_int maxindex;
		// find the maximum absolute value of the residual
		cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
		cuDoubleComplex cublasmax;
		// transfer the maximum value to the host
		cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
		// take the module
		oldmax = cabs( (((double ) &(cublasmax))[0]) + I(((double *) &(cublasmax))[1]));
		printf("Initial max residue = %g\n", oldmax);
		if (oldmax < accuracygoal) iteraterefine = false;
		}
		// begin correction loop (should iterate maxiters times)
		int iter;
		for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
		// multiply the inverse times the residual, add to the initial inverse
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
		// add to the initial inverse
		cublascall(CUZAXPY(handle, mm, &cu_one, d_a_refine, 1, d_c, 1));
		// multiply minus the original matrix times the new inverse matrix
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		// find the maximum absolute value of the residual
		np_int maxindex;
		cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
		// transfer the maximum value to the host
		cuDoubleComplex cublasmax;
		cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
		// take the module
		double newmax = cabs( (((double ) &(cublasmax))[0]) + I(((double *) &(cublasmax))[1]));
		printf("Max residue after %d iterations = %g\n", iter+1, newmax);
		// if the maximum in the residual decreased from the previous iteration,
		// update oldmax and go on, otherwise no point further iterating refinements
		if ((refinemode==2) && ((newmax > oldmax)\|\|(newmax < accuracygoal))) iteraterefine = false;
		oldmax = newmax;
		}
		}
		// if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
		if (refinemode==2) maxiters = iter;
		accuracygoal = oldmax;
		// end correction loop
		}
		// copy the final refined inverted matrix back from device to host
		cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		// free temporary device arrays
		cudaFree(d_a);
		cudaFree(d_c);
		if (maxiters>0) {
		cudaFree(d_id);
		cudaFree(d_a_refine);
		cudaFree(d_a_residual);
		}

		cublasDestroy_v2(handle);

		}


		#endif