Commit d587e3df authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Prepare CLUSTER to use offload switch

parent 3445f26d
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -918,8 +918,12 @@ int cluster_jxi488_cycle(
  write_dcomplex_matrix(outam0, cid->am, ndit, ndit);
  outam0->write_to_disk(outam0_name);
  delete outam0;
#endif
#endif // DEBUG_AM
#ifdef USE_OFFLOAD
  cms_gpu(cid->am, cid->c1);
#else
  cms(cid->am, cid->c1);
#endif // USE_OFFLOAD
#ifdef DEBUG_AM
  VirtualAsciiFile *outam1 = new VirtualAsciiFile();
  string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
+12 −0
Original line number Diff line number Diff line
@@ -93,6 +93,7 @@ dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int
 */
double cgev(int ipamo, int mu, int l, int m);

#ifndef USE_OFFLOAD
/*! \brief Build the multi-centered M-matrix of the cluster.
 *
 * This function constructs the multi-centered M-matrix of the cluster, according
@@ -102,6 +103,17 @@ double cgev(int ipamo, int mu, int l, int m);
 * \param c1: `ParticleDescriptor *`
 */
void cms(dcomplex **am, ParticleDescriptor *c1);
#else
/*! \brief Build the multi-centered M-matrix of the cluster.
 *
 * This function constructs the multi-centered M-matrix of the cluster, according
 * to Eq. (5.28) of Borghese, Denti & Saija (2007).
 *
 * \param am: `complex double **`
 * \param c1: `ParticleDescriptor *`
 */
void cms_gpu(dcomplex **am, ParticleDescriptor *c1);
#endif // USE_OFFLOAD

/*! \brief Compute orientation-averaged scattered field intensity.
 *
+94 −10
Original line number Diff line number Diff line
@@ -436,6 +436,7 @@ double cgev(int ipamo, int mu, int l, int m) {
// #pragma omp end declare target
// #endif

#ifndef USE_OFFLOAD
void cms(dcomplex **am, ParticleDescriptor *c1) {
  const dcomplex cc0 = 0.0 + 0.0 * I;
  int ndi = c1->nsph * c1->nlim;
@@ -484,8 +485,8 @@ void cms(dcomplex **am, ParticleDescriptor *c1) {
	      am[j1 - 1][j2e - 1] = cgk * (1.0 * isk);
	      am[j1e - 1][j2 - 1] = cgk * (1.0 * isk);
	      am[j1e - 1][j2e - 1] = cgh * (1.0 * ish);
	    }
	  }
	    } // im2 loop
	  } // l2 loop
	} // im1 loop
      } // l1 loop
    } // n2 loop
@@ -504,20 +505,103 @@ void cms(dcomplex **am, ParticleDescriptor *c1) {
	int ilm1 = il1 + m1;
	int i1 = in1 + ilm1;
	int i1e = i1 + ndi;
	for (int ilm2 = 1; ilm2 <= c1->nlim; ilm2++) {
	  int i2 = in1 + ilm2;
	  int i2e = i2 + ndi;
	  am[i1 - 1][i2 - 1] = cc0;
	  am[i1 - 1][i2e - 1] = cc0;
	  am[i1e - 1][i2 - 1] = cc0;
	  am[i1e - 1][i2e - 1] = cc0;
	}
	// The following 0-initializations are overkill, since am is initialized at 0.
	// for (int ilm2 = 1; ilm2 <= c1->nlim; ilm2++) {
	//   int i2 = in1 + ilm2;
	//   int i2e = i2 + ndi;
	//   am[i1 - 1][i2 - 1] = cc0;
	//   am[i1 - 1][i2e - 1] = cc0;
	//   am[i1e - 1][i2 - 1] = cc0;
	//   am[i1e - 1][i2e - 1] = cc0;
	// }
	am[i1 - 1][i1 - 1] = dm;
	am[i1e - 1][i1e - 1] = de;
      } // im1 loop
    } // l1 loop
  } // n1 loop
}
#else
void cms_gpu(dcomplex **am, ParticleDescriptor *c1) {
  const dcomplex cc0 = 0.0 + 0.0 * I;
  int ndi = c1->nsph * c1->nlim;
  // int nbl = 0;
  int nsphmo = c1->nsph - 1;
  for (int n1 = 1; n1 <= nsphmo; n1++) { // GPU portable?
    int in1 = (n1 - 1) * c1->nlim;
    int n1po = n1 + 1;
    for (int n2 = n1po; n2 <= c1->nsph; n2++) {
      int in2 = (n2 - 1) * c1->nlim;
      // nbl++;
      int nbl = ((n1 - 1) * (2 * c1->nsph - n1)) / 2 + n2 - n1;
      for (int l1 = 1; l1 <= c1->li; l1++) {
	int l1po = l1 + 1;
	int il1 = l1po * l1;
	int l1tpo = l1po + l1;
	for (int im1 = 1; im1 <= l1tpo; im1++) {
	  int m1 = im1 - l1po;
	  int ilm1 = il1 + m1;
	  int ilm1e = ilm1 + ndi;
	  int i1 = in1 + ilm1;
	  int i1e = in1 + ilm1e;
	  int j1 = in2 + ilm1;
	  int j1e = in2 + ilm1e;
	  for (int l2 = 1; l2 <= c1->li; l2++) {
	    int l2po = l2 + 1;
	    int il2 = l2po * l2;
	    int l2tpo = l2po + l2;
	    int ish = ((l2 + l1) % 2 == 0) ? 1 : -1;
	    int isk = -ish;
	    for (int im2 = 1; im2 <= l2tpo; im2++) {
	      int m2 = im2 - l2po;
	      int ilm2 = il2 + m2;
	      int ilm2e = ilm2 + ndi;
	      int i2 = in2 + ilm2;
	      int i2e = in2 + ilm2e;
	      int j2 = in1 + ilm2;
	      int j2e = in1 + ilm2e;
	      dcomplex cgh = ghit(0, 0, nbl, l1, m1, l2, m2, c1);
	      dcomplex cgk = ghit(0, 1, nbl, l1, m1, l2, m2, c1);
	      am[i1 - 1][i2 - 1] = cgh;
	      am[i1 - 1][i2e - 1] = cgk;
	      am[i1e - 1][i2 - 1] = cgk;
	      am[i1e - 1][i2e - 1] = cgh;
	      am[j1 - 1][j2 - 1] = cgh * (1.0 * ish);
	      am[j1 - 1][j2e - 1] = cgk * (1.0 * isk);
	      am[j1e - 1][j2 - 1] = cgk * (1.0 * isk);
	      am[j1e - 1][j2e - 1] = cgh * (1.0 * ish);
	    } // im2 loop
	  } // l2 loop
	} // im1 loop
      } // l1 loop
    } // n2 loop
  } // n1 loop

  int max_l1tpo = 2 * c1->li + 1;
  np_int diag_iters = c1->li * max_l1tpo * c1->nsph;
#pragma omp parallel for collapse(3)
  for (int n1 = 1; n1 <= c1->nsph; n1++) {
    for (int l1 = 1; l1 <= c1->li; l1++) { // GPU portable
      for (int im1 = 1; im1 <= max_l1tpo; im1++) {
	dcomplex scratch_e, scratch_m;
	int in1 = (n1 - 1) * c1->nlim;
	dcomplex dm = c1->rmi[l1 - 1][n1 - 1];
	dcomplex de = c1->rei[l1 - 1][n1 - 1];
	int l1po = l1 + 1;
	int il1 = l1po * l1;
	int l1tpo = l1po + l1;
	int m1 = (im1 <= l1tpo) ? im1 - l1po : 0;
	int ilm1 = (im1 <= l1tpo) ? il1 + m1 : 0;
	int i1 = (im1 <= l1tpo) ? in1 + ilm1 : 0;
	int i1e = (im1 <= l1tpo) ? i1 + ndi : 0;
	dcomplex *target_m = (im1 <= l1tpo) ? (am[i1 - 1] + i1 - 1) : &scratch_m;
	dcomplex *target_e = (im1 <= l1tpo) ? (am[i1e - 1] + i1e - 1) : &scratch_e;
	*target_m = dm;
	*target_e = de;
      } // im1 loop
    } // l1 loop
  } // n1 loop
}
#endif // USE_OFFLOAD

void crsm1(double vk, double exri, ParticleDescriptor *c1) {
  dcomplex ***svf, ***svw, **svs;