Commit 8a4d015b authored by Mulas, Giacomo

apply scr2() logic to ztm() loops vastly improving parallelisation

parent 68a9a01c
+50 −45
@@ -2259,10 +2259,6 @@ void scr2(
      vec_sas[vecindex+2] = s21 * csam;
      vec_sas[vecindex+1] = s12 * csam;
      vec_sas[vecindex+3] = s22 * csam;
      // c1->sas[i][0][0] = s11 * csam;
      // c1->sas[i][1][0] = s21 * csam;
      // c1->sas[i][0][1] = s12 * csam;
      // c1->sas[i][1][1] = s22 * csam;
    }
    // label 12
    dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
@@ -2270,10 +2266,6 @@ void scr2(
    tsas10 += (c1->sas[iogi - 1][1][0] * phas);
    tsas01 += (c1->sas[iogi - 1][0][1] * phas);
    tsas11 += (c1->sas[iogi - 1][1][1] * phas);
    // c3->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas);
    // c3->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas);
    // c3->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas);
    // c3->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas);
  } // i14 loop
#ifdef USE_NVTX
  nvtxRangePop();
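The two hunks above replace writes through the nested c1->sas[i][row][col] member with writes into a flat dcomplex buffer, so the device sees one contiguous allocation instead of a pointer-chased array of arrays. A minimal standalone sketch of the indexing convention, assuming dcomplex is C's double _Complex (consistent with the cexp() call above) and that the nsph 2x2 blocks sit back to back in row-major order, with base vecindex = 4*i and offset = 2*row + col matching the vecindex+1/+2/+3 writes in the hunk:

#include <complex.h>
#include <stdlib.h>
typedef double _Complex dcomplex; // assumed typedef, matching cexp() usage

int main(void) {
  const int nsph = 3; // illustrative stand-in for the sphere count
  // One contiguous block holding nsph 2x2 scattering matrices.
  dcomplex *vec_sas = (dcomplex *) malloc(nsph * 4 * sizeof(dcomplex));
  for (int i = 0; i < nsph; i++) {
    int vecindex = 4 * i;        // base of the i-th 2x2 block
    vec_sas[vecindex]     = 1.0; // [0][0], s11 * csam in the real code
    vec_sas[vecindex + 1] = 2.0; // [0][1], s12 * csam
    vec_sas[vecindex + 2] = 3.0; // [1][0], s21 * csam
    vec_sas[vecindex + 3] = 4.0; // [1][1], s22 * csam
  }
  free(vec_sas);
  return 0;
}

The same flat-view idiom appears later in the commit, where gis_v, gls_v, am_v and sam_v alias row 0 of the corresponding two-level arrays.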
@@ -2297,11 +2289,9 @@ void scr2(
#pragma omp target teams distribute parallel for simd collapse(4)
      for (int ipo1 = 1; ipo1 <=2; ipo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	  // cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]);
	  for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	    for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
	      int j = jpo2 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8;
	      // j++;
	      vec_vints[(i24 - 1)*16+j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq;
	    } // jpo2 loop
	  } // ipo2 loop
@@ -2322,10 +2312,8 @@ void scr2(
#pragma omp target parallel for collapse(4)
  for (int ipo1 = 1; ipo1 <=2; ipo1++) {
    for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
      // cc = dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]);
      for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
	  // j++;
	  int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8;
	  c1ao->vintt[j] = c3->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]) * cfsq;
	} // jpo2 loop
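In both vints hunks above, the closed-form index replaces the sequential j++ counter left behind in the commented-out lines: once j is a pure function of the loop variables, no iteration depends on the previous one, which is what makes collapse(4) legal. A minimal standalone check, in plain C, that the closed form reproduces the old counter order (jpo2 fastest, ipo1 slowest):

#include <assert.h>

int main(void) {
  int j_seq = 0; // the old sequential "j++" counter
  for (int ipo1 = 1; ipo1 <= 2; ipo1++)
    for (int jpo1 = 1; jpo1 <= 2; jpo1++)
      for (int ipo2 = 1; ipo2 <= 2; ipo2++)
        for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
          int j = jpo2 - 1 + (ipo2 - 1) * 2 + (jpo1 - 1) * 4 + (ipo1 - 1) * 8;
          assert(j == j_seq); // closed form reproduces the counter
          j_seq++;
        }
  return 0;
}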
@@ -2443,38 +2431,54 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) {
  // C9 *c9_para = new C9(*c9);
  dcomplex *gis_v = c9->gis[0];
  dcomplex *gls_v = c9->gls[0];
#pragma omp target parallel
  {
  double *rac3j_local = (double *) malloc(c6->lmtpo*sizeof(double));
#pragma omp for collapse(3)
  int k2max = c4->li*(c4->li+2);
  int k3max = c4->le*(c4->le+2);
  // To parallelise, I run a linearised loop directly over k.
  // Working out the algebra, it turns out that
  //   k = l*l - 1 + im
  // We invert this to find
  //   l = (int) sqrt(k+1) and im = k - l*l + 1
  // If this yields im = 0, we set l = l-1 and im = 2*l+1.
  // If instead it yields im > 2*l+1 (a rounding error in a
  // nearly exact root), we set im = im - (2*l+1) and l = l+1.
# pragma omp target teams distribute parallel for simd collapse(3)
  for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable?
      for (int l2 = 1; l2 <= c4->li; l2++) {
	for (int l3 = 1; l3 <= c4->le; l3++) {
	  int l2tpo = l2 + l2 + 1;
	  int l3tpo = l3 + l3 + 1;
	  for (int im2 = 1; im2 <= l2tpo; im2++) {
    for (int k2 = 1; k2<=k2max; k2++) {
      for (int k3 = 1; k3<=k3max; k3++) {
	int l2 = (int) sqrt(k2+1);
	int im2 = k2 - (l2*l2) + 1;
	if (im2 == 0) {
	  l2--;
	  im2 = 2*l2+1;
	}
	else if (im2 > 2*l2 + 1) {
	  im2 -= 2*l2 + 1;
	  l2++;
	}
	int l3 = (int) sqrt(k3+1);
	int im3 = k3 - (l3*l3) + 1;
	if (im3 == 0) {
	  l3--;
	  im3 = 2*l3+1;
	}
	else if (im3 > 2*l3 + 1) {
	  im3 -= 2*l3 + 1;
	  l3++;
	}
	// int l2tpo = l2 + l2 + 1;
	// int l3tpo = l3 + l3 + 1;
	int i2 = (n2-1) * c4->li * (c4->li + 2) + l2 * l2 + im2 - 1;
	    //	  int vecindex0 = (i2 - 1)*c9->nlem;
	int m2 = -l2 - 1 + im2;
	    // i2++; // old implementation
	    // int i3 = 0; // old implementation
	    //#pragma omp for
	    for (int im3 = 1; im3 <= l3tpo; im3++) {
	int i3 = l3 * l3 + im3 - 1;
	int m3 = -l3 - 1 + im3;
	int vecindex = (i2 - 1)*c9->nlem + i3 - 1;
	      // i3++; // old implementation
	gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local);
	gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local);
	      // c9->gis[i2 - 1][i3 - 1] = ghit(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local);
	      // c9->gls[i2 - 1][i3 - 1] = ghit(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local);
	    } // im3 loop
	  } // l3 loop
	} // im2 loop
      } // l2 loop
    } // n2 loop
      } // close k3 loop, former l3 + im3 loops
    } // close k2 loop, former l2 + im2 loops
  } // close n2 loop
  free(rac3j_local);
  } // pragma omp parallel
#ifdef USE_NVTX
  nvtxRangePop();
#endif
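A minimal standalone check of the k = l*l - 1 + im linearisation and its inverse described in the comments above; lmax is an illustrative stand-in for c4->li (or c4->le), and the two correction branches mirror the ones in the hunk:

#include <assert.h>
#include <math.h>

int main(void) {
  const int lmax = 8; // illustrative stand-in for c4->li / c4->le
  int k = 0;
  for (int l = 1; l <= lmax; l++) {
    for (int im = 1; im <= 2 * l + 1; im++) {
      k++; // enumerates k = l*l - 1 + im in order
      int lr = (int) sqrt((double)(k + 1));
      int imr = k - lr * lr + 1;
      if (imr == 0) {       // k+1 was an exact square: sqrt landed on l+1
        lr--;
        imr = 2 * lr + 1;
      } else if (imr > 2 * lr + 1) { // sqrt rounded down a nearly exact root
        imr -= 2 * lr + 1;
        lr++;
      }
      assert(lr == l && imr == im); // round trip recovers (l, im)
    }
  }
  return 0;
}

With k2max = li*(li+2) and k3max = le*(le+2), this range is exactly what the collapsed k2/k3 loops above sweep.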
@@ -2483,7 +2487,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) {
#endif
  dcomplex *am_v = am[0];
  dcomplex *sam_v = c9->sam[0];
# pragma omp target teams distribute parallel for collapse(2)
# pragma omp target teams distribute parallel for simd collapse(2)
  for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable?
    for (int i3 = 1; i3 <= c4->nlem; i3++) {
      dcomplex sum1 = cc0;
@@ -2492,6 +2496,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) {
      dcomplex sum4 = cc0;
      int i1e = i1 + ndi;
      int i3e = i3 + c4->nlem;
#pragma omp parallel for simd reduction(+:sum1,sum2,sum3,sum4)
      for (int i2 = 1; i2 <= ndi; i2++) {
	int i2e = i2 + ndi;
	int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1;