Commit 35cd635f authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

use less memory for identity subtraction in iterative refinement

parent 7b044c50
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -143,12 +143,12 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, m*m*sizeof(cuDoubleComplex)));
    cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, m*m*sizeof(cuDoubleComplex)));
    // allocate memory for the temporary matrix products
    dcomplex *native_id = new dcomplex[m];
    for (np_int i=0; i<m; i++) native_id[i] = 1;
    dcomplex *native_id = new dcomplex[1];
    native_id[0] = 1;
    cuDoubleComplex *m_id = (cuDoubleComplex *) &(native_id[0]);
    // fill it with 1
    cudacall(cudaMalloc<cuDoubleComplex>(&d_id, m*sizeof(cuDoubleComplex)));
    cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
    cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
    cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
    delete[] native_id; // free identity vector on host
  }
  bool iteraterefine = true;
@@ -167,7 +167,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
    // Fortran-style arrays, whereas our arrays are C-style row-major.
    cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
    // add the identity to the product
    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
    double oldmax=0;
    if (refinemode >0) {
      np_int maxindex;
@@ -191,7 +191,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
      // multiply minus the original matrix times the new inverse matrix
      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
      // add the identity to the product
      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	// find the maximum absolute value of the residual
	np_int maxindex;
+8 −9
Original line number Diff line number Diff line
@@ -92,6 +92,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
#endif
  np_int nn = n*n;
  np_int incx = 1;
  np_int incx0 = 0;
#ifdef USE_MKL
  MKL_Complex16 *arr_orig = NULL;
  MKL_Complex16 *arr_residual = NULL;
@@ -108,17 +109,15 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
    arr_orig = new MKL_Complex16[nn];
    arr_residual = new MKL_Complex16[nn];
    arr_refine = new MKL_Complex16[nn];
    id = new MKL_Complex16[n];
    for (np_int i=0; i<n ; i++) {
      id[i].real =  1;
      id[i].imag =  0;
    }
    id = new MKL_Complex16[1];
    id[0].real =  1;
    id[0].imag =  0;
#else
    arr_orig = new dcomplex[nn];
    arr_residual = new dcomplex[nn];
    arr_refine = new dcomplex[nn];
    id = new dcomplex[n];
    for (np_int i=0; i<n ; i++) id[i] = (dcomplex) 1;
    id = new dcomplex[1];
    id[0] = (dcomplex) 1;
#endif
    zcopy_(&nn, arr, &incx, arr_orig, &incx);
  }
@@ -153,7 +152,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
    // Fortran-style arrays, whereas our arrays are C-style row-major.
    zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
    np_int incy = n+1;
    zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy);
    zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
    double oldmax = 0;
    if (refinemode >0) {
      np_int maxindex = izamax_(&nn, arr_residual, &incx);
@@ -171,7 +170,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
      zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx);
	// zcopy_(&nn, arr_refine, &incx, arr, &incx);
      zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
      zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy);
      zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	np_int maxindex = izamax_(&nn, arr_residual, &incx);
#ifdef USE_MKL
+6 −6
Original line number Diff line number Diff line
@@ -132,16 +132,16 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
    }
    // allocate memory for the identity vector on the host
    {
      dcomplex *native_id = new dcomplex[m];
      for (magma_int_t i=0; i<m; i++) native_id[i] = 1;
      dcomplex *native_id = new dcomplex[1];
      native_id[0] = 1;
      magmaDoubleComplex *id = (magmaDoubleComplex *) &(native_id[0]);
      // fill it with 1
      err = magma_zmalloc(&d_id, m);
      err = magma_zmalloc(&d_id, 1);
      if (err != MAGMA_SUCCESS) {
	printf("Error allocating d_id\n");
	exit(1);
      }
      magma_zsetvector(m, id, 1, d_id, 1, queue); // copy identity to device vector
      magma_zsetvector(1, id, 1, d_id, 1, queue); // copy identity to device vector
      delete[] native_id; // free identity vector on host
    }
  }
@@ -161,7 +161,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
    // Fortran-style arrays, whereas our arrays are C-style row-major.
    magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m,  magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue);
    // add the identity to the product
    magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue);
    magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue);
    double oldmax=0;
    if (refinemode >0) {
      // find the maximum absolute value of the residual
@@ -184,7 +184,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
      // multiply minus the original matrix times the new inverse matrix
      magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m, magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue);
      // add the identity to the product
      magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue);
      magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue);
      if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
	// find the maximum absolute value of the residual
	magma_int_t maxindex = magma_izamax(mm, d_a_residual, 1, queue);