Commit 6d81e749 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Revert to non parallel version of scr2()

parent 124f1f6a
Loading
Loading
Loading
Loading
+40 −146
Original line number Original line Diff line number Diff line
@@ -2288,9 +2288,6 @@ void scr2(
	  double vk, double vkarg, double exri, double *duk,
	  double vk, double vkarg, double exri, double *duk,
	  ParticleDescriptor *c1
	  ParticleDescriptor *c1
) {
) {
#ifdef USE_NVTX
  nvtxRangePush("scr2 starts");
#endif
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc;
  dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc;
@@ -2300,177 +2297,74 @@ void scr2(
  double cfsq = 4.0 / (pi4sq * ccs * ccs);
  double cfsq = 4.0 / (pi4sq * ccs * ccs);
  cph = uim * exri * vkarg;
  cph = uim * exri * vkarg;
  int ls = (c1->li < c1->le) ? c1->li : c1->le;
  int ls = (c1->li < c1->le) ? c1->li : c1->le;
  int kmax = ls*(ls+2);
  c1->tsas[0][0] = cc0;
  dcomplex *vec_rmi = c1->rmi[0];
  c1->tsas[1][0] = cc0;
  dcomplex *vec_rei = c1->rei[0];
  c1->tsas[0][1] = cc0;
  dcomplex *vec_w = c1->w[0];
  c1->tsas[1][1] = cc0;
  dcomplex *vec_sas = c1->sas[0][0];
#ifdef USE_NVTX
  nvtxRangePush("scr2 outer loop 1");
#endif
#pragma omp parallel for
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
    int i = i14 - 1;
    int i = i14 - 1;
    int iogi = c1->iog[i14 - 1];
    int iogi = c1->iog[i14 - 1];
    if (iogi >= i14) {
    if (iogi >= i14) {
      // int k = 0;
      int k = 0;
      dcomplex s11 = cc0;
      s11 = cc0;
      dcomplex s21 = cc0;
      s21 = cc0;
      dcomplex s12 = cc0;
      s12 = cc0;
      dcomplex s22 = cc0;
      s22 = cc0;
      // To parallelise, I run a linearised loop directly over k
      for (int l10 = 1; l10 <= ls; l10++) {
      // working out the algebra, it turns out that
      // k = l10*l10-1+im10
      // we invert this to find
      // l10 = (int) sqrt(k+1) and im10 = k - l10*10+1
      // but if it results im10 = 0, then we set l10 = l10-1 and im10 = 2*l10+1
      // furthermore if it results im10 > 2*l10+1, then we set
      // im10 = im10 -(2*l10+1) and l10 = l10+1 (there was a rounding error in a nearly exact root)
#ifdef USE_NVTX
      nvtxRangePush("scr2 inner loop 1");
#endif
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
// #else
// #pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
// #endif
#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
      for (int k = 1; k<=kmax; k++) {
	int l10 = (int) sqrt(k+1);
	int im10 = k - (l10*l10) + 1;
	if (im10 == 0) {
	  l10--;
	  im10 = 2*l10+1;
	}
	else if (im10 > 2*l10 + 1) {
	  im10 -= 2*l10 + 1;
	  l10++;
	}
	// I have all the indices in my linearised loop
	int l = l10 - 1;
	int l = l10 - 1;
	rm = 1.0 / c1->rmi[l][i];
	re = 1.0 / c1->rei[l][i];
	int ltpo = l10 + l10 + 1;
	for (int im10 = 1; im10 <= ltpo; im10++) {
	  k++;
	  int ke = k + c1->nlem;
	  int ke = k + c1->nlem;
	int km1t4 = (k - 1)*4;
	  s11 -= (c1->w[k - 1][2] * c1->w[k - 1][0] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][0] * re);
	int kem1t4 = (ke - 1)*4;
	  s21 -= (c1->w[k - 1][3] * c1->w[k - 1][0] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][0] * re);
	dcomplex auxrm0 = vec_w[km1t4] / vec_rmi[l*c1->nsph+i];
	  s12 -= (c1->w[k - 1][2] * c1->w[k - 1][1] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][1] * re);
	dcomplex auxrm1 = vec_w[km1t4+1] / vec_rmi[l*c1->nsph+i];
	  s22 -= (c1->w[k - 1][3] * c1->w[k - 1][1] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][1] * re);
	dcomplex auxre0 = vec_w[kem1t4] / vec_rei[l*c1->nsph+i];
	} // im10 loop
	dcomplex auxre1 = vec_w[kem1t4+1] / vec_rei[l*c1->nsph+i];
      } // l10 loop
	s11 -= vec_w[km1t4+2] * auxrm0 + vec_w[kem1t4+2] * auxre0;
      c1->sas[i][0][0] = s11 * csam;
	s21 -= vec_w[km1t4+3] * auxrm0 + vec_w[kem1t4+3] * auxre0;
      c1->sas[i][1][0] = s21 * csam;
	s12 -= vec_w[km1t4+2] * auxrm1 + vec_w[kem1t4+2] * auxre1;
      c1->sas[i][0][1] = s12 * csam;
	s22 -= vec_w[km1t4+3] * auxrm1 + vec_w[kem1t4+3] * auxre1;
      c1->sas[i][1][1] = s22 * csam;
      }
#ifdef USE_NVTX
      nvtxRangePop();
#endif
      int vecindex = i*4;
      vec_sas[vecindex] = s11 * csam;
      vec_sas[vecindex+2] = s21 * csam;
      vec_sas[vecindex+1] = s12 * csam;
      vec_sas[vecindex+3] = s22 * csam;
    }
    }
    // label 12
    // label 12
    // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
    phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
    // tsas00 += (c1->sas[iogi - 1][0][0] * phas);
    c1->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas);
    // tsas10 += (c1->sas[iogi - 1][1][0] * phas);
    c1->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas);
    // tsas01 += (c1->sas[iogi - 1][0][1] * phas);
    c1->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas);
    // tsas11 += (c1->sas[iogi - 1][1][1] * phas);
    c1->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas);
  } // i14 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  dcomplex tsas00 = cc0;
  dcomplex tsas10 = cc0;
  dcomplex tsas01 = cc0;
  dcomplex tsas11 = cc0;
#ifdef USE_NVTX
  nvtxRangePush("scr2 loop 2");
#endif
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
// #else
// #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
// #endif
#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
    int i = i14 - 1;
    int iogi = c1->iog[i14 - 1];
    // label 12
    dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
    tsas00 += (c1->sas[iogi - 1][0][0] * phas);
    tsas10 += (c1->sas[iogi - 1][1][0] * phas);
    tsas01 += (c1->sas[iogi - 1][0][1] * phas);
    tsas11 += (c1->sas[iogi - 1][1][1] * phas);
  } // i14 loop
  } // i14 loop
  c1->tsas[0][0] = tsas00;
  c1->tsas[1][0] = tsas10;
  c1->tsas[0][1] = tsas01;
  c1->tsas[1][1] = tsas11;
#ifdef USE_NVTX
  nvtxRangePop();
  //#endif
  //dcomplex *vec_vints = c1->vints[0];
  //#ifdef USE_NVTX
  nvtxRangePush("scr2 outer loop 3");
#endif
#pragma omp parallel for
  for (int i24 = 1; i24 <= c1->nsph; i24++) {
  for (int i24 = 1; i24 <= c1->nsph; i24++) {
    int iogi = c1->iog[i24 - 1];
    int iogi = c1->iog[i24 - 1];
    if (iogi >= i24) {
    if (iogi >= i24) {
      // int j = 0;
      int j = 0;
#ifdef USE_NVTX
      nvtxRangePush("scr2 inner loop 3");
#endif
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for simd collapse(4)
// #else
// #pragma omp parallel for simd collapse(4)
// #endif
#pragma omp parallel for simd collapse(4)
      for (int ipo1 = 1; ipo1 <=2; ipo1++) {
      for (int ipo1 = 1; ipo1 <=2; ipo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	  cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]);
	  for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	  for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	    for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
	    for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
	      int j = jpo2 - 1 + (ipo2 - 1) * 2 + (jpo1 - 1) * 4 + (ipo1 - 1) * 8;
	      j++;
	      c1->vints[i24 - 1][j] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq;
	      c1->vints[i24 - 1][j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * cc * cfsq;
	    } // jpo2 loop
	    } // jpo2 loop
	  } // ipo2 loop
	  } // ipo2 loop
	} // jpo1 loop
	} // jpo1 loop
      } // ipo1 loop
      } // ipo1 loop
#ifdef USE_NVTX
      nvtxRangePop();
#endif
    }
    }
  } // i24 loop
  } // i24 loop
#ifdef USE_NVTX
  int j = 0;
  nvtxRangePop();
#endif
  // int j = 0;
#ifdef USE_NVTX
  nvtxRangePush("scr2 loop 4");
#endif
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for collapse(4)
// #else
// #pragma omp parallel for collapse(4)
// #endif
#pragma omp parallel for collapse(4)
  for (int ipo1 = 1; ipo1 <=2; ipo1++) {
  for (int ipo1 = 1; ipo1 <=2; ipo1++) {
    for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
    for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
      cc = dconjg(c1->tsas[jpo1 - 1][ipo1 - 1]);
      for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
      for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
	for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
	  int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8;
	  j++;
	  c1->vintt[j] = c1->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c1->tsas[jpo1 - 1][ipo1 - 1]) * cfsq;
	  c1->vintt[j - 1] = c1->tsas[jpo2 - 1][ipo2 - 1] * cc * cfsq;
	} // jpo2 loop
	} // jpo2 loop
      } // ipo2 loop
      } // ipo2 loop
    } // jpo1 loop
    } // jpo1 loop
  } // ipo1 loop
  } // ipo1 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
#ifdef USE_NVTX
  nvtxRangePop();
#endif
}
}


void str(double **rcf, ParticleDescriptor *c1) {
void str(double **rcf, ParticleDescriptor *c1) {