Commit 95f8fcf8 authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

parallelisation in principle done, working but not very effective, profiling

needed
parent 8a4d015b
Loading
Loading
Loading
Loading
+63 −34
Original line number Diff line number Diff line
@@ -1293,6 +1293,9 @@ void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **ce
}

void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
#ifdef USE_NVTX
  nvtxRangePush("whole pcros");
#endif
  const dcomplex cc0 = 0.0 + 0.0 * I;
  dcomplex sump, sum1, sum2, sum3, sum4, am, amp, cc, csam;
  const double exdc = exri * exri;
@@ -1303,8 +1306,13 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
  double cfsq = 4.0 / (pi4sq *ccs * ccs);
  const int nlemt = c4->nlem + c4->nlem;
  int jpo = 2;
  for (int ipo18 = 1; ipo18 <= 2; ipo18++) {
    if (ipo18 == 2) jpo = 1;
  dcomplex *vec_am0m = c1ao->am0m[0];
  dcomplex *vec_w = c1->w[0];
#ifdef USE_NVTX
  nvtxRangePush("pcros outer loop 1");
#endif
  for (int ipo18 = 0; ipo18 < 2; ipo18++) {
    int jpo = 1-ipo18;
    int ipopt = ipo18 + 2;
    int jpopt = jpo + 2;
    double sum = 0.0;
@@ -1313,32 +1321,44 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
    sum2 = cc0;
    sum3 = cc0;
    sum4 = cc0;
    for (int i12 = 1; i12 <= nlemt; i12++) {
      int i = i12 - 1;
      am = cc0;
      amp = cc0;
      for (int j10 = 1; j10 <= nlemt; j10++) {
	int j = j10 - 1;
	am += (c1ao->am0m[i][j] * c1->w[j][ipo18 - 1]);
	amp += (c1ao->am0m[i][j] * c1->w[j][jpo - 1]);
#ifdef USE_NVTX
  nvtxRangePush("pcros intermediate loop 1");
#endif
#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
    for (int i12 = 0; i12 < nlemt; i12++) {
      // int i = i12 - 1;
      dcomplex am = cc0;
      dcomplex amp = cc0;
      //#pragma omp target teams distribute parallel for simd reduction(+:am,amp)
      for (int j10 = 0; j10 < nlemt; j10++) {
	// int j = j10 - 1;
	am += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+ipo18]);
	amp += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+jpo]);
      } // j10 loop
      sum += real(dconjg(am) * am);
      sump += (dconjg(amp) * am);
      sum1 += (dconjg(c1->w[i][ipo18 - 1]) * am);
      sum2 += (dconjg(c1->w[i][jpo - 1]) * am);
      sum3 += (c1->w[i][ipopt - 1] * am);
      sum4 += (c1->w[i][jpopt - 1] * am);
      sum1 += (dconjg(vec_w[4*i12+ipo18]) * am);
      sum2 += (dconjg(vec_w[4*i12+jpo]) * am);
      sum3 += (vec_w[4*i12+ipopt] * am);
      sum4 += (vec_w[4*i12+jpopt] * am);
    } // i12 loop
    c1ao->scsc[ipo18 - 1] = cccs * sum;
    c1ao->scscp[ipo18 - 1] = cccs * sump;
    c1ao->ecsc[ipo18 - 1] = -cccs * real(sum1);
    c1ao->ecscp[ipo18 - 1] = -cccs * sum2;
    c1ao->fsac[ipo18 - 1][ipo18 - 1] = csam * sum1;
    c1ao->fsac[jpo - 1][ipo18 - 1] = csam * sum2;
    c1ao->sac[ipo18 - 1][ipo18 - 1] = csam * sum3;
    c1ao->sac[jpo - 1][ipo18 - 1] = csam * sum4;
#ifdef USE_NVTX
  nvtxRangePop();
#endif
    c1ao->scsc[ipo18] = cccs * sum;
    c1ao->scscp[ipo18] = cccs * sump;
    c1ao->ecsc[ipo18] = -cccs * real(sum1);
    c1ao->ecscp[ipo18] = -cccs * sum2;
    c1ao->fsac[ipo18][ipo18] = csam * sum1;
    c1ao->fsac[jpo][ipo18] = csam * sum2;
    c1ao->sac[ipo18][ipo18] = csam * sum3;
    c1ao->sac[jpo][ipo18] = csam * sum4;
  } // ipo18 loop
  int i = 0;
  dcomplex * &vint =  c1->vint;
#ifdef USE_NVTX
  nvtxRangePush("pcros loop 2");
#endif
  for (int ipo1 = 1; ipo1 <= 2; ipo1++) {
    for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
      cc = dconjg(c1ao->sac[jpo1 - 1][ipo1 - 1]);
@@ -1349,38 +1369,47 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
      } // ipo2 loop
    } // jpo1 loop
  } // ipo1 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
#ifdef USE_NVTX
  nvtxRangePop();
#endif
}

void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  dcomplex sum1, sum2, sum3, sum4, sumpd;
  dcomplex sum1, sum2, sum3, sum4;
  dcomplex sums1, sums2, sums3, sums4, csam;
  double exdc = exri * exri;
  double ccs = 4.0 * acos(0.0) / (vk * vk);
  double cccs = ccs / exdc;
  int nlemt = c4->nlem + c4->nlem;
  dcomplex *vec_am0m = c1ao->am0m[0];
  csam = -(ccs / (exri * vk)) * 0.5 * I;
  sum2 = cc0;
  sum3 = cc0;
  for (int i14 = 1; i14 <= c4->nlem; i14++) { // GPU portable?
#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
  for (int i14 = 0; i14 < c4->nlem; i14++) { 
    int ie = i14 + c4->nlem;
    sum2 += (c1ao->am0m[i14 - 1][i14 - 1] + c1ao->am0m[ie - 1][ie - 1]);
    sum3 += (c1ao->am0m[i14 - 1][ie - 1] + c1ao->am0m[ie - 1][i14 - 1]);
    sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
    sum3 += (vec_am0m[nlemt*i14 + ie] + vec_am0m[nlemt*ie + i14]);
  } // i14 loop
  double sumpi = 0.0;
  sumpd = cc0;
  int nlemt = c4->nlem + c4->nlem;
  for (int i16 = 1; i16 <= nlemt; i16++) {
    for (int j16 = 1; j16 <= c4->nlem; j16++) {
  dcomplex sumpd = cc0;
#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
  for (int i16 = 0; i16 < nlemt; i16++) {
    for (int j16 = 0; j16 < c4->nlem; j16++) {
      int je = j16 + c4->nlem;
      double rvalue = real(
			   dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][j16 - 1]
			   + dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][je - 1]
			   dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + j16]
			   + dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + je]
			   );
      sumpi += rvalue;
      sumpd += (
		dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][je - 1]
		+ dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][j16 - 1]
		dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + je]
		+ dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + j16]
		);
    } // j16 loop
  } // i16 loop