Create back-up files of old LAPACK and cuBLAS calls (78d49a34) · Commits · Giacomo Mulas / NP_TMcode

src/include/cublas_calls_h.bak

0 → 100644

+51 −0

Original line number	Diff line number	Diff line
		/* Copyright (C) 2025 INAF - Osservatorio Astronomico di Cagliari

		This program is free software: you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation, either version 3 of the License, or
		(at your option) any later version.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		GNU General Public License for more details.

		A copy of the GNU General Public License is distributed along with
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file cublas_calls.h
		*
		* \brief C++ interface to CUBLAS calls.
		*
		*/

		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#define INCLUDE_CUBLAS_CALLS_H_

		/*! \brief Invert a complex matrix with double precision elements.
		*
		* Use CUBLAS to perform an in-place matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param device_id: `int` ID of the device for matrix inversion offloading.
		*/
		void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

		/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution
		*
		* Use CUBLAS to perform matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param maxrefiters: `int` Maximum number of refinement iterations to apply.
		* \param accuracygoal: `double &` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
		* \param refinemode: `int` Flag to choose the refinement mode.
		* \param device_id: `int` ID of the device for matrix inversion offloading.
		*/
		void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);

		#endif

src/include/lapack_calls_h.bak

0 → 100644

+51 −0

Original line number	Diff line number	Diff line
		/* Copyright (C) 2025 INAF - Osservatorio Astronomico di Cagliari

		This program is free software: you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation, either version 3 of the License, or
		(at your option) any later version.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		GNU General Public License for more details.

		A copy of the GNU General Public License is distributed along with
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file lapack_calls.h
		*
		* \brief C++ interface to LAPACK calls.
		*
		*/

		#ifndef INCLUDE_LAPACK_CALLS_H_
		#define INCLUDE_LAPACK_CALLS_H_

		/*! \brief Invert a complex matrix with double precision elements.
		*
		* Use LAPACKE64 to perform an in-place matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param jer: `int &` Reference to an integer return flag.
		*/
		void zinvert(dcomplex **mat, np_int n, int &jer);

		/*! \brief Invert a complex matrix with double precision elements, using iterative refinement to improve accuracy.
		*
		* Use LAPACKE64 to perform an in-place matrix inversion for a complex
		* matrix with double precision elements.
		*
		* \param mat: Matrix of complex. The matrix to be inverted.
		* \param n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param jer: `int &` Reference to an integer return flag.
		* \param maxrefiters: `int` Maximum number of refinement iterations.
		* \param accuracygoal: `double &` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
		* \param refinemode: `int` Flag to control the refinement mode.
		*/
		void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxrefiters, double &accuracygoal, int refinemode);

		#endif

src/libnptm/cublas_calls_cpp.bak

0 → 100644

+232 −0

Original line number	Diff line number	Diff line
		/* Copyright (C) 2025 INAF - Osservatorio Astronomico di Cagliari

		This program is free software: you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation, either version 3 of the License, or
		(at your option) any later version.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		GNU General Public License for more details.

		A copy of the GNU General Public License is distributed along with
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file cublas_calls.cpp
		*
		* \brief Implementation of the interface with CUBLAS libraries.
		*/
		#ifndef INCLUDE_TYPES_H_
		#include "../include/types.h"
		#endif

		//#define USE_CUBLAS 1
		#ifdef USE_CUBLAS

		#ifndef INCLUDE_CUBLAS_CALLS_H_
		#include "../include/cublas_calls.h"
		#endif

		#include <limits>
		#include <cuda_runtime.h>
		#include <cublas_v2.h>

		#ifdef USE_ILP64
		#define CUZGEMM cublasZgemm_64
		#define CUZAXPY cublasZaxpy_64
		#define CUIZAMAX cublasIzamax_64
		#else
		#define CUZGEMM cublasZgemm
		#define CUZAXPY cublasZaxpy
		#define CUIZAMAX cublasIzamax
		#endif

		#define cudacall(call) \
		do \
		{ \
		cudaError_t err = (call); \
		if(cudaSuccess != err) \
		{ \
		fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
		cudaDeviceReset(); \
		exit(EXIT_FAILURE); \
		} \
		} \
		while (0)

		#define cublascall(call) \
		do \
		{ \
		cublasStatus_t status = (call); \
		if(CUBLAS_STATUS_SUCCESS != status) \
		{ \
		fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
		cudaDeviceReset(); \
		exit(EXIT_FAILURE); \
		} \
		\
		} \
		while(0)

		void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
		cudacall(cudaSetDevice(device_id));
		cublasHandle_t handle;
		cublascall(cublasCreate_v2(&handle));
		int batchsize = 1;
		int piv, info; // array of pivot indices
		np_int m = (np_int) n; // changed rows; a - mxm matrix
		np_int mm = m * m; // size of a
		cudacall(cudaMalloc<int>(&piv, mbatchsizesizeof(int)));
		cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
		cuDoubleComplex a = (cuDoubleComplex )&(mat[0][0]); // pointer to first element on host
		cuDoubleComplex *d_a; // pointer to first element on device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a,mmsizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
		cuDoubleComplex **batch_d_a;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_a,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_a, &d_a, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
		cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_c,mmsizeof(cuDoubleComplex)));
		cuDoubleComplex **batch_d_c;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_c,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_c, &d_c, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
		cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cudaFree(d_a);
		cudaFree(d_c);
		cublasDestroy_v2(handle);

		}

		void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
		cudacall(cudaSetDevice(device_id));
		cublasHandle_t handle;
		cublascall(cublasCreate_v2(&handle));
		int batchsize = 1;
		int piv, info; // array of pivot indices
		np_int m = (np_int) n; // changed rows; a - mxm matrix
		np_int mm = m * m; // size of a
		cudacall(cudaMalloc<int>(&piv, mbatchsizesizeof(int)));
		cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
		cuDoubleComplex a = (cuDoubleComplex )&(mat[0][0]); // pointer to first element on host
		cuDoubleComplex *d_a; // pointer to first element on device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a,mmsizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
		cuDoubleComplex **batch_d_a;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_a,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_a, &d_a, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
		cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
		cudacall(cudaMalloc<cuDoubleComplex>(&d_c,mmsizeof(cuDoubleComplex)));
		cuDoubleComplex **batch_d_c;
		cudacall(cudaMalloc<cuDoubleComplex>(&batch_d_c,batchsizesizeof(cuDoubleComplex*)));
		cudacall(cudaMemcpy(batch_d_c, &d_c, batchsizesizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
		cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
		//cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		cudaFree(batch_d_a);
		cudaFree(batch_d_c);
		cudaFree(piv);
		cudaFree(info);
		cuDoubleComplex *d_a_residual;
		cuDoubleComplex *d_a_refine;
		cuDoubleComplex *d_id;
		if (maxiters>0) {
		// copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
		cudacall(cudaMemcpy(d_a, a, mmsizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, mmsizeof(cuDoubleComplex)));
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, mmsizeof(cuDoubleComplex)));
		// allocate memory for the temporary matrix products
		dcomplex *native_id = new dcomplex[1];
		native_id[0] = 1;
		cuDoubleComplex m_id = (cuDoubleComplex ) &(native_id[0]);
		// fill it with 1
		cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
		delete[] native_id; // free identity vector on host
		}
		bool iteraterefine = true;
		if (maxiters>0) {
		cuDoubleComplex cu_mone;
		(((double *) &(cu_mone))[0]) = -1;
		(((double *) &(cu_mone))[1]) = 0;
		cuDoubleComplex cu_one;
		(((double *) &(cu_one))[0]) = 1;
		(((double *) &(cu_one))[1]) = 0;
		cuDoubleComplex cu_zero;
		(((double *) &(cu_zero))[0]) = 0;
		(((double *) &(cu_zero))[1]) = 0;
		// multiply minus the original matrix times the inverse matrix
		// NOTE: factors in zgemm are swapped because zgemm is designed for column-major
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
		double oldmax=0;
		if (refinemode >0) {
		np_int maxindex;
		// find the maximum absolute value of the residual
		cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
		cuDoubleComplex cublasmax;
		// transfer the maximum value to the host
		cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
		// take the module
		oldmax = cabs( (((double ) &(cublasmax))[0]) + I(((double *) &(cublasmax))[1]));
		printf("Initial max residue = %g\n", oldmax);
		if (oldmax < accuracygoal) iteraterefine = false;
		}
		// begin correction loop (should iterate maxiters times)
		int iter;
		for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
		// multiply the inverse times the residual, add to the initial inverse
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
		// add to the initial inverse
		cublascall(CUZAXPY(handle, mm, &cu_one, d_a_refine, 1, d_c, 1));
		// multiply minus the original matrix times the new inverse matrix
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		// find the maximum absolute value of the residual
		np_int maxindex;
		cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
		// transfer the maximum value to the host
		cuDoubleComplex cublasmax;
		cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
		// take the module
		double newmax = cabs( (((double ) &(cublasmax))[0]) + I(((double *) &(cublasmax))[1]));
		printf("Max residue after %d iterations = %g\n", iter+1, newmax);
		// if the maximum in the residual decreased from the previous iteration,
		// update oldmax and go on, otherwise no point further iterating refinements
		if ((refinemode==2) && ((newmax > oldmax)\|\|(newmax < accuracygoal))) iteraterefine = false;
		oldmax = newmax;
		}
		}
		// if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
		if (refinemode==2) maxiters = iter;
		accuracygoal = oldmax;
		// end correction loop
		}
		// copy the final refined inverted matrix back from device to host
		cudacall(cudaMemcpy(a,d_c,mmsizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
		// free temporary device arrays
		cudaFree(d_a);
		cudaFree(d_c);
		if (maxiters>0) {
		cudaFree(d_id);
		cudaFree(d_a_refine);
		cudaFree(d_a_residual);
		}

		cublasDestroy_v2(handle);

		}


		#endif

src/libnptm/lapack_calls_cpp.bak

0 → 100644

+196 −0

Original line number	Diff line number	Diff line
		/* Copyright (C) 2025 INAF - Osservatorio Astronomico di Cagliari

		This program is free software: you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation, either version 3 of the License, or
		(at your option) any later version.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		GNU General Public License for more details.

		A copy of the GNU General Public License is distributed along with
		this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
		*/

		/*! \file lapack_calls.cpp
		*
		* \brief Implementation of the interface with LAPACK libraries.
		*/
		#ifndef INCLUDE_TYPES_H_
		#include "../include/types.h"
		#endif

		/*
		#ifdef USE_LAPACK
		#ifdef USE_MKL
		#include <mkl_lapacke.h>
		#else
		#include <lapacke.h>
		#endif
		*/

		#ifdef USE_LAPACK

		#ifndef INCLUDE_LAPACK_CALLS_H_
		#include "../include/lapack_calls.h"
		#endif

		#include <limits>

		#ifdef USE_MKL
		extern "C" void zcopy_(np_int n, MKL_Complex16 arr1, np_int inc1, MKL_Complex16 arr2,
		np_int *inc2);
		extern "C" void zgemm_(char transa, char transb, np_int l, np_int m, np_int *n,
		MKL_Complex16 alpha, MKL_Complex16 a, np_int *lda,
		MKL_Complex16 b, np_int ldb, MKL_Complex16 *beta,
		MKL_Complex16 c, np_int ldc);
		extern "C" void zaxpy_(np_int n, MKL_Complex16 alpha, MKL_Complex16 arr1, np_int inc1,
		MKL_Complex16 arr2, np_int inc2);
		extern "C" np_int izamax_(np_int n, MKL_Complex16 arr1, np_int *inc1);
		#else
		extern "C" void zcopy_(np_int n, dcomplex arr1, np_int inc1, dcomplex arr2, np_int *inc2);
		extern "C" void zgemm_(char transa, char transb, np_int l, np_int m, np_int *n,
		dcomplex alpha, dcomplex a, np_int *lda,
		dcomplex b, np_int ldb, dcomplex *beta,
		dcomplex c, np_int ldc);
		extern "C" void zaxpy_(np_int n, dcomplex alpha, dcomplex arr1, np_int inc1,
		dcomplex arr2, np_int inc2);
		extern "C" np_int izamax_(np_int n, dcomplex arr1, np_int *inc1);
		#endif

		void zinvert(dcomplex **mat, np_int n, int &jer) {
		jer = 0;
		dcomplex *arr = &(mat[0][0]);
		const dcomplex uim = 0.0 + 1.0 * I;

		#ifdef USE_MKL
		MKL_Complex16 arr2 = (MKL_Complex16 ) arr;
		#endif

		np_int* IPIV = new np_int[n]();

		#ifdef USE_MKL
		LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, arr2, n, IPIV);
		LAPACKE_zgetri(LAPACK_ROW_MAJOR, n, arr2, n, IPIV);
		#else
		LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, arr, n, IPIV);
		LAPACKE_zgetri(LAPACK_ROW_MAJOR, n, arr, n, IPIV);
		#endif

		delete[] IPIV;
		}

		void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, double &accuracygoal, int refinemode) {

		jer = 0;
		#ifdef USE_MKL
		MKL_Complex16 arr = (MKL_Complex16 ) &(mat[0][0]);
		#else
		dcomplex *arr = &(mat[0][0]);
		#endif
		np_int nn = n*n;
		np_int incx = 1;
		np_int incx0 = 0;
		#ifdef USE_MKL
		MKL_Complex16 *arr_orig = NULL;
		MKL_Complex16 *arr_residual = NULL;
		MKL_Complex16 *arr_refine = NULL;
		MKL_Complex16 *id = NULL;
		#else
		dcomplex *arr_orig = NULL;
		dcomplex *arr_residual = NULL;
		dcomplex *arr_refine = NULL;
		dcomplex *id = NULL;
		#endif
		if (maxiters>0) {
		#ifdef USE_MKL
		arr_orig = new MKL_Complex16[nn];
		arr_residual = new MKL_Complex16[nn];
		arr_refine = new MKL_Complex16[nn];
		id = new MKL_Complex16[1];
		id[0].real = 1;
		id[0].imag = 0;
		#else
		arr_orig = new dcomplex[nn];
		arr_residual = new dcomplex[nn];
		arr_refine = new dcomplex[nn];
		id = new dcomplex[1];
		id[0] = (dcomplex) 1;
		#endif
		zcopy_(&nn, arr, &incx, arr_orig, &incx);
		}
		// const dcomplex uim = 0.0 + 1.0 * I;

		np_int* IPIV = new np_int[n]();

		LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, arr, n, IPIV);
		LAPACKE_zgetri(LAPACK_ROW_MAJOR, n, arr, n, IPIV);
		delete[] IPIV;

		if (maxiters>0) {
		bool iteraterefine = true;
		char transa = 'N';
		#ifdef USE_MKL
		MKL_Complex16 dczero;
		dczero.real = 0;
		dczero.imag = 0;
		MKL_Complex16 dcone;
		dcone.real = 1;
		dcone.imag = 0;
		MKL_Complex16 dcmone;
		dcmone.real = -1;
		dcmone.imag = 0;
		#else
		dcomplex dczero = 0;
		dcomplex dcone = 1;
		dcomplex dcmone = -1;
		#endif
		// multiply minus the original matrix times the inverse matrix
		// NOTE: factors in zgemm are swapped because zgemm is designed for column-major
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
		np_int incy = n+1;
		zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
		double oldmax = 0;
		if (refinemode >0) {
		np_int maxindex = izamax_(&nn, arr_residual, &incx);
		#ifdef USE_MKL
		oldmax = cabs(arr_residual[maxindex].real + I*arr_residual[maxindex].imag);
		#else
		oldmax = cabs(arr_residual[maxindex]);
		#endif
		printf("Initial max residue = %g\n", oldmax);
		if (oldmax < accuracygoal) iteraterefine = false;
		}
		int iter;
		for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
		zgemm_(&transa, &transa, &n, &n, &n, &dcone, arr_residual, &n, arr, &n, &dczero, arr_refine, &n);
		zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx);
		// zcopy_(&nn, arr_refine, &incx, arr, &incx);
		zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
		zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		np_int maxindex = izamax_(&nn, arr_residual, &incx);
		#ifdef USE_MKL
		double newmax = cabs(arr_residual[maxindex].real + I*arr_residual[maxindex].imag);
		#else
		double newmax = cabs(arr_residual[maxindex]);
		#endif
		printf("Max residue after %d iterations = %g\n", iter+1, newmax);
		if ((refinemode==2) && ((newmax > oldmax)\|\|(newmax < accuracygoal))) iteraterefine = false;
		oldmax = newmax;
		}
		}
		if (refinemode==2) maxiters = iter;
		accuracygoal = oldmax;
		delete[] id;
		delete[] arr_refine;
		delete[] arr_orig;
		delete[] arr_residual;
		}

		}

		#endif