Commit 78d49a34 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Create back-up files of old LAPACK and cuBLAS calls

parent d64edcbb
Loading
Loading
Loading
Loading
+51 −0
Original line number Diff line number Diff line
/* Copyright (C) 2025   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.h
 *
 * \brief C++ interface to CUBLAS calls.
 *
 */

#ifndef INCLUDE_CUBLAS_CALLS_H_
#define INCLUDE_CUBLAS_CALLS_H_

/*! \brief Invert a complex matrix with double precision elements.
 *
 * Use CUBLAS to perform an in-place matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution
 *
 * Use CUBLAS to perform matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param maxrefiters: `int` Maximum number of refinement iterations to apply.
 * \param accuracygoal: `double &` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
 * \param refinemode: `int` Flag to choose the refinement mode.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);

#endif
+51 −0
Original line number Diff line number Diff line
/* Copyright (C) 2025   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file lapack_calls.h
 *
 * \brief C++ interface to LAPACK calls.
 *
 */

#ifndef INCLUDE_LAPACK_CALLS_H_
#define INCLUDE_LAPACK_CALLS_H_

/*! \brief Invert a complex matrix with double precision elements.
 *
 * Use LAPACKE64 to perform an in-place matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param jer: `int &` Reference to an integer return flag.
 */
void zinvert(dcomplex **mat, np_int n, int &jer);

/*! \brief Invert a complex matrix with double precision elements, using iterative refinement to improve accuracy.
 *
 * Use LAPACKE64 to perform an in-place matrix inversion for a complex
 * matrix with double precision elements.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param jer: `int &` Reference to an integer return flag.
 * \param maxrefiters: `int` Maximum number of refinement iterations.
 * \param accuracygoal: `double &` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
 * \param refinemode: `int` Flag to control the refinement mode.
 */
void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxrefiters, double &accuracygoal, int refinemode);

#endif
+232 −0
Original line number Diff line number Diff line
/* Copyright (C) 2025   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.cpp
 *
 * \brief Implementation of the interface with CUBLAS libraries.
 */
#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif

//#define USE_CUBLAS 1
#ifdef USE_CUBLAS

#ifndef INCLUDE_CUBLAS_CALLS_H_
#include "../include/cublas_calls.h"
#endif

#include <limits>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#ifdef USE_ILP64
#define CUZGEMM cublasZgemm_64
#define CUZAXPY cublasZaxpy_64
#define CUIZAMAX cublasIzamax_64
#else
#define CUZGEMM cublasZgemm
#define CUZAXPY cublasZaxpy
#define CUIZAMAX cublasIzamax
#endif

#define cudacall(call)							\
  do									\
    {									\
      cudaError_t err = (call);						\
      if(cudaSuccess != err)						\
        {								\
	  fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
	  cudaDeviceReset();						\
	  exit(EXIT_FAILURE);						\
        }								\
    }									\
  while (0)

#define cublascall(call)						\
  do									\
    {									\
      cublasStatus_t status = (call);					\
      if(CUBLAS_STATUS_SUCCESS != status)				\
        {								\
	  fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
	  cudaDeviceReset();						\
	  exit(EXIT_FAILURE);						\
        }								\
      									\
    }									\
  while(0)

void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
  cudacall(cudaSetDevice(device_id));
  cublasHandle_t handle;
  cublascall(cublasCreate_v2(&handle));
  int batchsize = 1;
  int *piv, *info; // array of pivot indices
  np_int m = (np_int) n; // changed rows; a - mxm matrix
  np_int mm = m * m; // size of a
  cudacall(cudaMalloc<int>(&piv, m*batchsize*sizeof(int)));
  cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
  cuDoubleComplex *d_a; // pointer to first element on device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_a,m*m*sizeof(cuDoubleComplex)));
  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
  cuDoubleComplex **batch_d_a;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_c,m*m*sizeof(cuDoubleComplex)));
  cuDoubleComplex **batch_d_c;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  cudaFree(batch_d_a);
  cudaFree(batch_d_c);
  cudaFree(piv);
  cudaFree(info);
  cudaFree(d_a);
  cudaFree(d_c);
  cublasDestroy_v2(handle);

}

void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
  cudacall(cudaSetDevice(device_id));
  cublasHandle_t handle;
  cublascall(cublasCreate_v2(&handle));
  int batchsize = 1;
  int *piv, *info; // array of pivot indices
  np_int m = (np_int) n; // changed rows; a - mxm matrix
  np_int mm = m * m; // size of a
  cudacall(cudaMalloc<int>(&piv, m*batchsize*sizeof(int)));
  cudacall(cudaMalloc<int>(&info, batchsize*sizeof(int)));
  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
  cuDoubleComplex *d_a; // pointer to first element on device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_a,m*m*sizeof(cuDoubleComplex)));
  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
  cuDoubleComplex **batch_d_a;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
  cudacall(cudaMalloc<cuDoubleComplex>(&d_c,m*m*sizeof(cuDoubleComplex)));
  cuDoubleComplex **batch_d_c;
  cudacall(cudaMalloc<cuDoubleComplex*>(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
  cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
  //cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  cudaFree(batch_d_a);
  cudaFree(batch_d_c);
  cudaFree(piv);
  cudaFree(info);
  cuDoubleComplex *d_a_residual;
  cuDoubleComplex *d_a_refine;
  cuDoubleComplex *d_id;
  if (maxiters>0) {
    // copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
    cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use  
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, m*m*sizeof(cuDoubleComplex)));
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, m*m*sizeof(cuDoubleComplex)));
    // allocate memory for the temporary matrix products
    dcomplex *native_id = new dcomplex[1];
    native_id[0] = 1;
    cuDoubleComplex *m_id = (cuDoubleComplex *) &(native_id[0]);
    // fill it with 1
    cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
    cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
    delete[] native_id; // free identity vector on host
  }
  bool iteraterefine = true;
  if (maxiters>0) {
    cuDoubleComplex cu_mone;
    (((double *) &(cu_mone))[0]) = -1;
    (((double *) &(cu_mone))[1]) = 0;
    cuDoubleComplex cu_one;
    (((double *) &(cu_one))[0]) = 1;
    (((double *) &(cu_one))[1]) = 0;
    cuDoubleComplex cu_zero;
    (((double *) &(cu_zero))[0]) = 0;
    (((double *) &(cu_zero))[1]) = 0;
    // multiply minus the original matrix times the inverse matrix
    // NOTE: factors in zgemm are swapped because zgemm is designed for column-major
    // Fortran-style arrays, whereas our arrays are C-style row-major.
    cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
    // add the identity to the product
    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
    double oldmax=0;
    if (refinemode >0) {
      np_int maxindex;
      // find the maximum absolute value of the residual
      cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
      cuDoubleComplex cublasmax;
      // transfer the maximum value to the host
      cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
      // take the module
      oldmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
      printf("Initial max residue = %g\n", oldmax);
      if (oldmax < accuracygoal) iteraterefine = false;
    }
    // begin correction loop (should iterate maxiters times)
    int iter;
    for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
      // multiply the inverse times the residual, add to the initial inverse
      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
      // add to the initial inverse
      cublascall(CUZAXPY(handle, mm, &cu_one, d_a_refine, 1, d_c, 1));
      // multiply minus the original matrix times the new inverse matrix
      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
      // add the identity to the product
      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	// find the maximum absolute value of the residual
	np_int maxindex;
	cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
	// transfer the maximum value to the host
	cuDoubleComplex cublasmax;
	cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
	// take the module
	double newmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
	printf("Max residue after %d iterations = %g\n", iter+1, newmax);
	// if the maximum in the residual decreased from the previous iteration,
	// update oldmax and go on, otherwise no point further iterating refinements
	if ((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false;
	oldmax = newmax;
      }
    }
    // if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
    if (refinemode==2) maxiters = iter;
    accuracygoal = oldmax;
    // end correction loop
  }
  // copy the final refined inverted matrix back from device to host
  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
  // free temporary device arrays 
  cudaFree(d_a);
  cudaFree(d_c);
  if (maxiters>0) {
    cudaFree(d_id);
    cudaFree(d_a_refine);
    cudaFree(d_a_residual);
  }

  cublasDestroy_v2(handle);

}


#endif
+196 −0
Original line number Diff line number Diff line
/* Copyright (C) 2025   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file lapack_calls.cpp
 *
 * \brief Implementation of the interface with LAPACK libraries.
 */
#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif

/*
#ifdef USE_LAPACK
#ifdef USE_MKL
#include <mkl_lapacke.h>
#else
#include <lapacke.h>
#endif
*/

#ifdef USE_LAPACK

#ifndef INCLUDE_LAPACK_CALLS_H_
#include "../include/lapack_calls.h"
#endif

#include <limits>

#ifdef USE_MKL
  extern "C" void zcopy_(np_int *n, MKL_Complex16 *arr1, np_int *inc1, MKL_Complex16 *arr2,
		     np_int *inc2);
  extern "C" void zgemm_(char *transa, char *transb, np_int *l, np_int *m, np_int *n,
		     MKL_Complex16 *alpha, MKL_Complex16 *a, np_int *lda,
		     MKL_Complex16 *b, np_int *ldb, MKL_Complex16 *beta,
		     MKL_Complex16 *c, np_int *ldc);
  extern "C" void zaxpy_(np_int *n, MKL_Complex16 *alpha, MKL_Complex16 *arr1, np_int *inc1,
		     MKL_Complex16 *arr2, np_int *inc2);
  extern "C" np_int izamax_(np_int *n, MKL_Complex16 *arr1, np_int *inc1);
#else
  extern "C" void zcopy_(np_int *n, dcomplex *arr1, np_int *inc1, dcomplex *arr2, np_int *inc2);
  extern "C" void zgemm_(char *transa, char *transb, np_int *l, np_int *m, np_int *n,
		     dcomplex *alpha, dcomplex *a, np_int *lda,
		     dcomplex *b, np_int *ldb, dcomplex *beta,
		     dcomplex *c, np_int *ldc);
  extern "C" void zaxpy_(np_int *n, dcomplex *alpha, dcomplex *arr1, np_int *inc1,
		     dcomplex *arr2, np_int *inc2);
  extern "C" np_int izamax_(np_int *n, dcomplex *arr1, np_int *inc1);
#endif

void zinvert(dcomplex **mat, np_int n, int &jer) {
  jer = 0;
  dcomplex *arr = &(mat[0][0]);
  const dcomplex uim = 0.0 + 1.0 * I;

#ifdef USE_MKL
  MKL_Complex16 *arr2 = (MKL_Complex16 *) arr;
#endif
  
  np_int* IPIV = new np_int[n]();
  
#ifdef USE_MKL
  LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, arr2, n, IPIV);
  LAPACKE_zgetri(LAPACK_ROW_MAJOR, n, arr2, n, IPIV);
#else
  LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, arr, n, IPIV);
  LAPACKE_zgetri(LAPACK_ROW_MAJOR, n, arr, n, IPIV);
#endif

  delete[] IPIV;
}

void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, double &accuracygoal, int refinemode) {

  jer = 0;
#ifdef USE_MKL
  MKL_Complex16 *arr = (MKL_Complex16 *) &(mat[0][0]);
#else
  dcomplex *arr = &(mat[0][0]);
#endif
  np_int nn = n*n;
  np_int incx = 1;
  np_int incx0 = 0;
#ifdef USE_MKL
  MKL_Complex16 *arr_orig = NULL;
  MKL_Complex16 *arr_residual = NULL;
  MKL_Complex16 *arr_refine = NULL;
  MKL_Complex16 *id = NULL;
#else
  dcomplex *arr_orig = NULL;
  dcomplex *arr_residual = NULL;
  dcomplex *arr_refine = NULL;
  dcomplex *id = NULL;
#endif
  if (maxiters>0) {
#ifdef USE_MKL
    arr_orig = new MKL_Complex16[nn];
    arr_residual = new MKL_Complex16[nn];
    arr_refine = new MKL_Complex16[nn];
    id = new MKL_Complex16[1];
    id[0].real =  1;
    id[0].imag =  0;
#else
    arr_orig = new dcomplex[nn];
    arr_residual = new dcomplex[nn];
    arr_refine = new dcomplex[nn];
    id = new dcomplex[1];
    id[0] = (dcomplex) 1;
#endif
    zcopy_(&nn, arr, &incx, arr_orig, &incx);
  }
  // const dcomplex uim = 0.0 + 1.0 * I;
  
  np_int* IPIV = new np_int[n]();
  
  LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, arr, n, IPIV);
  LAPACKE_zgetri(LAPACK_ROW_MAJOR, n, arr, n, IPIV);
  delete[] IPIV;

  if (maxiters>0) {
    bool iteraterefine = true;
    char transa = 'N';
#ifdef USE_MKL
    MKL_Complex16 dczero;
    dczero.real = 0;
    dczero.imag = 0;
    MKL_Complex16 dcone;
    dcone.real = 1;
    dcone.imag = 0;
    MKL_Complex16 dcmone;
    dcmone.real = -1;
    dcmone.imag = 0;
#else
    dcomplex dczero = 0;
    dcomplex dcone = 1;
    dcomplex dcmone = -1;
#endif
    // multiply minus the original matrix times the inverse matrix
    // NOTE: factors in zgemm are swapped because zgemm is designed for column-major
    // Fortran-style arrays, whereas our arrays are C-style row-major.
    zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
    np_int incy = n+1;
    zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
    double oldmax = 0;
    if (refinemode >0) {
      np_int maxindex = izamax_(&nn, arr_residual, &incx);
#ifdef USE_MKL
      oldmax = cabs(arr_residual[maxindex].real + I*arr_residual[maxindex].imag);
#else
      oldmax = cabs(arr_residual[maxindex]);
#endif
      printf("Initial max residue = %g\n", oldmax);
      if (oldmax < accuracygoal) iteraterefine = false;
    }
    int iter;
    for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
      zgemm_(&transa, &transa, &n, &n, &n, &dcone, arr_residual, &n, arr, &n, &dczero, arr_refine, &n);
      zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx);
	// zcopy_(&nn, arr_refine, &incx, arr, &incx);
      zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
      zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	np_int maxindex = izamax_(&nn, arr_residual, &incx);
#ifdef USE_MKL
	double newmax = cabs(arr_residual[maxindex].real + I*arr_residual[maxindex].imag);
#else
	double newmax = cabs(arr_residual[maxindex]);
#endif
	printf("Max residue after %d iterations = %g\n", iter+1, newmax);
	if ((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false;
	oldmax = newmax; 
      }
    }
    if (refinemode==2) maxiters = iter;
    accuracygoal = oldmax;
    delete[] id;
    delete[] arr_refine;
    delete[] arr_orig;
    delete[] arr_residual;
  }

}

#endif