Commit 29407bf2 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Modify CLUSTER to perform multi-centered matrix initialization and inversion...

Modify CLUSTER to perform multi-centered matrix initialization and inversion with a single GPU transfer
parent af01e8e8
Loading
Loading
Loading
Loading
+64 −7
Original line number Diff line number Diff line
@@ -886,16 +886,72 @@ int cluster_jxi488_cycle(
  delete outam0;
#endif // DEBUG_AM
#ifdef USE_TARGET_OFFLOAD
  if (rs.use_offload) {
#ifdef USE_MAGMA
    magmaDoubleComplex* vec_am = (magmaDoubleComplex *)(cid->am[0]);
  int device_id = cid->proc_device;
#pragma omp target data map(tofrom: vec_am[0:ndit*ndit]) device(device_id)
    magma_queue_t queue = NULL;
    magma_device_t device_id = (magma_device_t)(cid->proc_device);
    magma_queue_create(device_id, &queue);
#pragma omp target data map(to: vec_am[0:ndit*ndit]) device(cid->proc_device)
    {
    magma_cms(vec_am, cid->c1, cid->proc_device);
      magma_cms(vec_am, cid->c1, cid->proc_device); // Initialize uninverted matrix on GPU

      /*
      // DEBUG STEP
#pragma omp target update from(vec_am[0:ndit*ndit]) device(cid->proc_device)
      message = "c_magma_cms_jxi_" + to_string(jxi488) + ".ppm";
      write_matrix_as_ppm(cid->am[0], ndit, ndit, message, "MAG", 1, 1);
      {
	VirtualAsciiFile *outam1 = new VirtualAsciiFile();
	string outam1_name = output_path + "/c_MAGMA_AM1_JXI" + to_string(jxi488) + ".txt";
	sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
	outam1->append_line(virtual_line);
	sprintf(virtual_line, " %d\n", ndit);
	outam1->append_line(virtual_line);  
	sprintf(virtual_line, " I1+1   I2+1    Real    Imag\n");
	outam1->append_line(virtual_line);
	write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
	outam1->write_to_disk(outam1_name);
	delete outam1;
      }
      */

#pragma omp target data use_device_ptr(vec_am) device(cid->proc_device)
      {
	magma_zinvert_resident(vec_am, (magma_int_t)ndit, jer, queue, cid->proc_device, rs);
	magma_queue_sync(queue);
      }
      if (jer == 0) {
	// Get the final inverted matrix.
#pragma omp target update from(vec_am[0:ndit*ndit]) device(cid->proc_device)
      } else {
	sprintf(virtual_line, "ERROR: matrix inversion returned code %d!\n", jer);
	message = virtual_line;
	logger->err(message);
      }
    }
    magma_queue_destroy(queue);
#else // NO_USE_MAGMA
    cms_flat(cid->am[0], cid->c1);
#endif // USE_MAGMA
#endif // USE_TARGET_OFFLOAD
  cms_gpu(cid->am[0], cid->c1);
  }
#else // NO_USE_TARGET_OFFLOAD
  cms(cid->am[0], cid->c1);
  message = "c_cpu_cms_jxi_" + to_string(jxi488) + ".ppm";
  write_matrix_as_ppm(cid->am[0], ndit, ndit, message, "MAG", 1, 1);
  {
    VirtualAsciiFile *outam1 = new VirtualAsciiFile();
    string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
    sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
    outam1->append_line(virtual_line);
    sprintf(virtual_line, " %d\n", ndit);
    outam1->append_line(virtual_line);  
    sprintf(virtual_line, " I1+1   I2+1    Real    Imag\n");
    outam1->append_line(virtual_line);
    write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
    outam1->write_to_disk(outam1_name);
    delete outam1;
  }
#ifdef DEBUG_AM
  VirtualAsciiFile *outam1 = new VirtualAsciiFile();
  string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
@@ -947,6 +1003,7 @@ int cluster_jxi488_cycle(
    return jer;
    // break; // jxi488 loop: goes to memory clean
  }
#endif // USE_TARGET_OFFLOAD
  interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
  nvtxRangePush("Average calculation");
+4 −4
Original line number Diff line number Diff line
@@ -104,10 +104,10 @@ double cgev(int ipamo, int mu, int l, int m);
 * This function constructs the multi-centered M-matrix of the cluster, according
 * to Eq. (5.28) of Borghese, Denti & Saija (2007).
 *
 * \param am: `complex double **`
 * \param c1: `ParticleDescriptor *`
 * \param[in,out] am: `complex double *`
 * \param[in] c1: `ParticleDescriptor *`
 */
void cms(dcomplex **am, ParticleDescriptor *c1);
void cms(dcomplex *am, ParticleDescriptor *c1);

#ifdef USE_TARGET_OFFLOAD
/**
@@ -119,7 +119,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1);
 * \param am: `complex double **`
 * \param c1: `ParticleDescriptor *`
 */
void cms_gpu(dcomplex *am, ParticleDescriptor *c1);
void cms_flat(dcomplex *am, ParticleDescriptor *c1);
#endif // USE_TARGET_OFFLOAD

/**
+19 −0
Original line number Diff line number Diff line
@@ -105,11 +105,30 @@ template<int IHI> magmaDoubleComplex magma_ghit(
void magma_r3jjr(int j2, int j3, int m2, int m3, double rac3j[]);
#pragma omp end declare target

/**
 * \brief Invert a pre-existing GPU complex matrix with double precision elements.
 *
 * \param[in,out] mat: Matrix of complex. The matrix to be inverted.
 * \param[in] n: `magma_int_t` The number of rows and columns of the [n x n] matrix.
 * \param[out] jer: `int &` Reference to an integer return flag.
 * \param[in] queue: `magma_queue_t` The MAGMA device communication queue.
 * \param[in] device_id: `int` ID of the device for matrix inversion offloading (optional, default 0).
 * \param[in] rs: `const RuntimeSettings &` Runtime settings instance (optional).
 */
void magma_zinvert_resident(
  magmaDoubleComplex *mat, magma_int_t n, int &jer, magma_queue_t queue, int device_id=0,
  const RuntimeSettings& rs=RuntimeSettings()
);
#endif // USE_TARGET_OFFLOAD

/**
 * \brief Invert a complex matrix with double precision elements.
 *
 * This function is a first implementation of GPU use. The data transfer
 * steps are handled internally, via standard MAGMA calls, so that the matrix
 * to be inverted is copied from the host memory to the GPU, inverted, possibly
 * refined and finally taken back to the host, leaving the GPU free.
 *
 * \param[in,out] mat: Matrix of complex. The matrix to be inverted.
 * \param[in] n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param[out] jer: `int &` Reference to an integer return flag.
+9 −9
Original line number Diff line number Diff line
@@ -56,19 +56,19 @@ int write_dcomplex_matrix(
/**
 * \brief Draw a PPM representation of a matrix.
 *
 * \param A: `const dcomplex *` Vector representation of the matrix. [IN]
 * \param m: `int` Number of rows in the matrix. [IN]
 * \param n: `int` Number of columns in the matrix. [IN]
 * \param file_name: `const string&` Reference to a string with the name of the output file. [IN]
 * \param mode: `const string&` Reference to a string for the value to be represented ("MAG" to
 * \param[in] A: `const dcomplex *` Vector representation of the matrix.
 * \param[in] m: `int64_t` Number of rows in the matrix.
 * \param[in] n: `int64_t` Number of columns in the matrix.
 * \param[in] file_name: `const string&` Reference to a string with the name of the output file.
 * \param[in] mode: `const string&` Reference to a string for the value to be represented ("MAG" to
 * draw magnitudes, "RE" to draw real parts, "IM" to draw imaginary parts. Optional, default is
 * "MAG"). [IN]
 * \param row_bin: `int` Number of rows to bin together (optional, default is 1). [IN]
 * \param col_bin: `int` Number of columns to bin together (optional, default is 1). [IN]
 * "MAG").
 * \param[in] row_bin: `int` Number of rows to bin together (optional, default is 1).
 * \param[in] col_bin: `int` Number of columns to bin together (optional, default is 1).
 * \return result: `int` An exit code (0 if successful).
 */
int write_matrix_as_ppm(
  const dcomplex *A, int m, int n, const std::string& file_name, const std::string& mode="MAG",
  dcomplex *A, int64_t m, int64_t n, const std::string& file_name, const std::string& mode="MAG",
  int row_bin=1, int col_bin=1
);
#endif
+23 −21
Original line number Diff line number Diff line
@@ -457,13 +457,14 @@ double cgev(int ipamo, int mu, int l, int m) {
}
*/

void cms(dcomplex **am, ParticleDescriptor *c1) {
void cms(dcomplex *am, ParticleDescriptor *c1) {
  const int nsph = c1->nsph;
  const int nlim = c1->nlim;
  const int li = c1->li;
  const int ndi = nsph * nlim;
  const int ndit = ndi + ndi;
  const int nsphmo = nsph - 1;
  double *rac3j = c1->rac3j;
  const int lmtpo = c1->lmtpo;
  
  // int nbl = 0;
  for (int n1 = 1; n1 <= nsphmo; n1++) {
@@ -492,6 +493,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1) {
	    double rsh = ((l2 + l1) % 2 == 0) ? 1.0 : -1.0;
	    double rsk = -rsh;
	    for (int im2 = 1; im2 <= l2tpo; im2++) {
	      double rac3j[lmtpo];
	      int m2 = im2 - l2po;
	      int ilm2 = il2 + m2;
	      int ilm2e = ilm2 + ndi;
@@ -501,14 +503,14 @@ void cms(dcomplex **am, ParticleDescriptor *c1) {
	      int j2e = in1 + ilm2e;
	      dcomplex cgh = ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j);
	      dcomplex cgk = ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j);
	      am[i1 - 1][i2 - 1] = cgh;
	      am[i1 - 1][i2e - 1] = cgk;
	      am[i1e - 1][i2 - 1] = cgk;
	      am[i1e - 1][i2e - 1] = cgh;
	      am[j1 - 1][j2 - 1] = cgh * rsh;
	      am[j1 - 1][j2e - 1] = cgk * rsk;
	      am[j1e - 1][j2 - 1] = cgk * rsk;
	      am[j1e - 1][j2e - 1] = cgh * rsh;
	      am[(i1 - 1) * ndit + i2 - 1] = cgh;
	      am[(i1 - 1) * ndit + i2e - 1] = cgk;
	      am[(i1e - 1) * ndit + i2 - 1] = cgk;
	      am[(i1e - 1) * ndit + i2e - 1] = cgh;
	      am[(j1 - 1) * ndit + j2 - 1] = cgh * rsh;
	      am[(j1 - 1) * ndit + j2e - 1] = cgk * rsk;
	      am[(j1e - 1) * ndit + j2 - 1] = cgk * rsk;
	      am[(j1e - 1) * ndit + j2e - 1] = cgh * rsh;
	    } // im2 loop
	  } // l2 loop
	} // im1 loop
@@ -516,7 +518,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1) {
    } // n2 loop
  } // n1 loop
#pragma omp parallel for collapse(2)
  for (int n1 = 1; n1 <= nsph; n1++) { // GPU portable?
  for (int n1 = 1; n1 <= nsph; n1++) {
    for (int l1 = 1; l1 <= li; l1++) {
      int in1 = (n1 - 1) * nlim;
      dcomplex dm = c1->rmi[l1 - 1][n1 - 1];
@@ -533,20 +535,20 @@ void cms(dcomplex **am, ParticleDescriptor *c1) {
	// for (int ilm2 = 1; ilm2 <= c1->nlim; ilm2++) {
	//   int i2 = in1 + ilm2;
	//   int i2e = i2 + ndi;
	//   am[i1 - 1][i2 - 1] = cc0;
	//   am[i1 - 1][i2e - 1] = cc0;
	//   am[i1e - 1][i2 - 1] = cc0;
	//   am[i1e - 1][i2e - 1] = cc0;
	//   am[(i1 - 1) * ndit + i2 - 1] = cc0;
	//   am[(i1 - 1) * ndit + i2e - 1] = cc0;
	//   am[(i1e - 1) * ndit + i2 - 1] = cc0;
	//   am[(i1e - 1) * ndit + i2e - 1] = cc0;
	// }
	am[i1 - 1][i1 - 1] = dm;
	am[i1e - 1][i1e - 1] = de;
	am[(i1 - 1) * ndit + i1 - 1] = dm;
	am[(i1e - 1) * ndit + i1e - 1] = de;
      } // im1 loop
    } // l1 loop
  } // n1 loop
}

#ifdef USE_TARGET_OFFLOAD
void cms_gpu(dcomplex *am, ParticleDescriptor *c1) {
void cms_flat(dcomplex *am, ParticleDescriptor *c1) {
  const int nsph = c1->nsph;
  const int nlim = c1->nlim;
  const int li = c1->li;
Loading