Commit 08427feb authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Recover MAGMA implementation of refined GESV and RBT inversion

parent f86fc6c8
Loading
Loading
Loading
Loading
+143 −9
Original line number Diff line number Diff line
@@ -21,7 +21,8 @@

#ifdef USE_MAGMA

#include <limits>
//#include <limits>
#include <cstdio>
#include <string>

#ifndef INCLUDE_TYPES_H_
@@ -60,8 +61,9 @@
using namespace std;

void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const RuntimeSettings& rs) {
  dcomplex *vec_m = mat[0]; // Vector form of the input matrix.
  magmaDoubleComplex *a = (magmaDoubleComplex *)mat[0]; // pointer to first element on host
  string message;
  char buffer[128];
  magma_int_t err = MAGMA_SUCCESS;
  magma_queue_t queue = NULL;
  magma_device_t dev = (magma_device_t)device_id;
@@ -69,12 +71,12 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt
  magma_int_t info;
  magma_int_t m = (magma_int_t)n; // changed rows; a - mxm matrix
  magma_int_t mm = m * m; // size of a
  magmaDoubleComplex *a = (magmaDoubleComplex *)vec_m; // pointer to first element on host
  const magmaDoubleComplex magma_zero = MAGMA_Z_MAKE(0.0, 0.0);
  const magmaDoubleComplex magma_one = MAGMA_Z_MAKE(1.0, 0.0);
  const magmaDoubleComplex magma_mone = MAGMA_Z_MAKE(-1.0, 0.0);
  const magmaDoubleComplex magma_two = MAGMA_Z_MAKE(2.0, 0.0);
  if (rs.invert_mode == RuntimeSettings::INV_MODE_LU) { 
    // >>> LU INVERSION <<<
    magmaDoubleComplex *d_a; // pointer to first element on device
    magmaDoubleComplex *dwork; // work space pointer on device
    magma_int_t ldwork = m * magma_get_zgetri_nb(m); // optimal block size
@@ -95,18 +97,48 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt
    magma_zgetrf_gpu(m, m, d_a, m, piv, &info);
    magma_zgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
    delete[] piv; // delete piv created by magma_zgetrf()
    magma_free(dwork);
    if (rs.use_refinement) {
      err = magma_newton(rs, a, m, d_a, queue);
    }
    magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a
    magma_free(d_a);
    magma_free(dwork);
    // >>> END OF LU INVERSION <<<
  } else if (rs.invert_mode == RuntimeSettings::INV_MODE_GESV) {
    // >>> GESV INVERSION <<<
    magmaDoubleComplex *h_id = new magmaDoubleComplex[mm]();
    magmaDoubleComplex *d_a, *d_id;
    magma_int_t *piv = new magma_int_t[m];
    for (magma_int_t i = 0; i < m; i++) {
      h_id[i * (m + 1)] = magma_one;
    }
    err = magma_zmalloc(&d_id, mm);
    if (err != MAGMA_SUCCESS) {
      message = "ERROR: could not allocate d_id!\n";
      rs.logger->err(message);
      exit(1);
    }
    magma_zsetmatrix(m, m, h_id, m, d_id , m, queue); // copy identity matrix to device
    delete[] h_id;
    // allocate matrix on device
    err = magma_zmalloc(&d_a, mm); // device memory for a
    if (err != MAGMA_SUCCESS) {
      message = "ERROR: could not allocate d_a!\n";
      rs.logger->err(message);
      exit(1);
    }
    magma_zsetmatrix(m, m, a, m, d_a , m, queue); // copy a -> d_a
    magma_zgesv_gpu(m, m, d_a, m, piv, d_id, m, &info);
    delete[] piv; // free host memory
    magma_free(d_a);
    if (rs.use_refinement) {
      // refinement block
      printf("DEBUG: would refine GESV inversion.\n");
      err = magma_newton(rs, a, m, d_id, queue);
    }
    magma_zgetmatrix(m, m, d_id , m, a, m, queue); // copy d_id -> a
    magma_free(d_id);
    // >>> END OF GESV INVERSION <<<
  } else if (rs.invert_mode == RuntimeSettings::INV_MODE_RBT) {
    // >>> RBT INVERSION <<<
    magmaDoubleComplex *d_a; // pointer to first element on device
    magma_bool_t iterate = (magma_bool_t)rs.use_refinement;
    magma_int_t *piv = new magma_int_t[m]; // host mem.
@@ -126,7 +158,106 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt
    delete[] h_id;
    delete[] piv;
    magma_free(d_a); 
    // >>> END OF RBT INVERSION <<<
  } else if (rs.invert_mode == RuntimeSettings::INV_MODE_SVD) {
    // >>> SVD INVERSION <<<
    // Step 1: compute singular value decomposition
    magma_int_t lwork;
    magmaDoubleComplex *h_work = new magmaDoubleComplex[m];
    magma_zgesdd(
      MagmaAllVec, m, m, NULL, m, NULL, NULL, m, NULL, m, h_work, -1, NULL, NULL, &info
    );
    if (info == 0) {
      lwork = (magma_int_t)(MAGMA_Z_REAL(h_work[0]));
      message = "DEBUG: lwork = " + to_string(lwork) + "\n";
      rs.logger->log(message, LOG_DEBG);
    } else {
      message = "ERROR: magma_zgesdd failed on resource querying call!\n";
      rs.logger->err(message);
      exit(1);
    }
    delete[] h_work;
    h_work = new magmaDoubleComplex[lwork];
    magmaDoubleComplex *h_a, *h_U, *h_Vt;
    h_a = new magmaDoubleComplex[mm];
    memcpy(h_a, a, mm * sizeof(magmaDoubleComplex));
    h_U = new magmaDoubleComplex[mm];
    h_Vt = new magmaDoubleComplex[mm];
    double *h_s = new double[m];
    double *h_rwork = new double[5 * mm + 7 * m];
    magma_int_t *h_iwork = new magma_int_t[8 * m];
    magma_zgesdd(
      MagmaAllVec, m, m, h_a, m, h_s, h_U, m, h_Vt, m, h_work, lwork, h_rwork, h_iwork, &info
    );
    delete[] h_work;
    delete[] h_rwork;
    delete[] h_iwork;
    delete[] h_a;
    sprintf(buffer, "%.5le", h_s[0]);
    message = "DEBUG: s[0] = ";
    message += buffer;
    message += "; s[";
    message += to_string(m - 1);
    message += "] = ";
    sprintf(buffer, "%.5le", h_s[m - 1]);
    message += buffer;
    message += ".\n";
    rs.logger->log(message, LOG_DEBG);
    
    // Step 2: Upload decomposed matix to GPU
    magmaDoubleComplex *d_U, *d_Vt, *d_a;
    err = magma_zmalloc(&d_Vt, mm);
    if (err != MAGMA_SUCCESS) {
      message = "ERROR: could not allocate d_Vt!\n";
      rs.logger->err(message);
      exit(1);
    }
    magma_zsetmatrix(m, m, h_Vt, m, d_Vt, m, queue);
    err = magma_zmalloc(&d_U, mm);
    if (err != MAGMA_SUCCESS) {
      message = "ERROR: could not allocate d_U!\n";
      rs.logger->err(message);
      exit(1);
    }
    magma_zsetmatrix(m, m, h_U, m, d_U, m, queue);
    err = magma_zmalloc(&d_a, mm);
    if (err != MAGMA_SUCCESS) {
      message = "ERROR: could not allocate d_a!\n";
      rs.logger->err(message);
      exit(1);
    }

    // Step 3: compute threshold-truncated pseudo-inverse on GPU
    double eps_mach = 2.22e-18;
    double threshold = h_s[0] * m * eps_mach; 
    for (magma_int_t i = 0; i < m; i++) {
      magmaDoubleComplex inv_s = (h_s[i] > threshold) ? 
	MAGMA_Z_MAKE(1.0 / h_s[i], 0.0) : 
	MAGMA_Z_MAKE(0.0, 0.0);
      magma_zscal(m, inv_s, d_Vt + i, m, queue);
    }
    delete[] h_s;
    // d_a = d_vt^H * d_u^H
    magmablas_zgemm(
      MagmaConjTrans, MagmaConjTrans, m, m, m, magma_one, d_Vt, m, d_U, m, 
      magma_zero, d_a, m, queue
    );

    // Step 4: release matrix decomposition
    magma_free(d_U);
    magma_free(d_Vt);
    delete[] h_U;
    delete[] h_Vt;

    // Step 5: refine inversion
    if (rs.use_refinement) {
      err = magma_newton(rs, a, m, d_a, queue);
    }

    // Step 6: get result back to host
    magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a
    magma_free(d_a);
    // >>> END OF SVD INVERSION <<<
  }
  magma_queue_destroy(queue); // destroy queue
  jer = (int)err;
@@ -138,6 +269,7 @@ magma_int_t magma_newton(
) {
  magma_int_t err;
  string message;
  char buffer[128];
  const int max_ref_iters = rs.max_ref_iters;
  const magmaDoubleComplex magma_zero = MAGMA_Z_MAKE(0.0, 0.0);
  const magmaDoubleComplex magma_one = MAGMA_Z_MAKE(1.0, 0.0);
@@ -195,7 +327,9 @@ magma_int_t magma_newton(
    // Compute the norm of R
    norm = magma_dznrm2(mm, d_axa, 1, queue);
    double relative_res = norm / normA;
    message = "INFO: refinement iteration " + to_string(ri) + " achieved " + to_string(relative_res) + ".\n";
    sprintf(buffer, "%.5le", relative_res);
    message = "INFO: refinement iteration " + to_string(ri) + " achieved relative residual of "
      + buffer + ".\n";
    rs.logger->log(message);
    if (norm < 0.99 * old_norm) {
      // Transform d_ax in 2*I - A*X
@@ -209,7 +343,7 @@ magma_int_t magma_newton(
      // Set d_a = X*(2*I - A*X)
      magma_zcopy(mm, d_axa, 1, d_a, 1, queue);
      old_norm = norm;
      if (relative_res < 1.0e-14) {
      if (relative_res < 1.0e-16) {
	message = "DEBUG: good news - optimal convergence achieved. Stopping.\n";
	rs.logger->log(message, LOG_DEBG);
	break; // ri for