Commit 68a9a01c authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

complete parallelisation of raba()

parent 487de478
Loading
Loading
Loading
Loading
+50 −308
Original line number Original line Diff line number Diff line
@@ -1926,28 +1926,38 @@ void raba(
  ctqce[1] = ctqce[0]+3;
  ctqce[1] = ctqce[0]+3;
  ctqcs[1] = ctqcs[0]+3;
  ctqcs[1] = ctqcs[0]+3;
  dcomplex *vec_am0m = am0m[0];
  dcomplex *vec_am0m = am0m[0];
  dcomplex *vec_w = w[0];
#ifdef USE_NVTX
  nvtxRangePush("raba outer loop 1");
#endif
#pragma omp parallel for
#pragma omp parallel for
  for (int i20 = 1; i20 <= nlemt; i20++) {
  for (int i20 = 1; i20 <= nlemt; i20++) {
    int i = i20 - 1;
    int i = i20 - 1;
    dcomplex c1 = cc0;
    dcomplex c1 = cc0;
    dcomplex c2 = cc0;
    dcomplex c2 = cc0;
#ifdef USE_NVTX
  nvtxRangePush("raba inner loop 1");
#endif
#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
    for (int j10 = 1; j10 <= nlemt; j10++) {
    for (int j10 = 1; j10 <= nlemt; j10++) {
      int j = j10 - 1;
      int j = j10 - 1;
      c1 += (vec_am0m[i*nlemt+j] * w[j][0]);
      c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]);
      c2 += (vec_am0m[i*nlemt+j] * w[j][1]);
      c2 += (vec_am0m[i*nlemt+j] * vec_w[4*j+1]);
      // c1 += (am0m[i][j] * w[j][0]);
      // c2 += (am0m[i][j] * w[j][1]);
    } // j10 loop
    } // j10 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
    vec_a[2*i] = c1;
    vec_a[2*i] = c1;
    vec_a[2*i+1] = c2;
    vec_a[2*i+1] = c2;
    // a[i][0] = c1;
    // a[i][1] = c2;
  } //i20 loop
  } //i20 loop
  int jpo = 2;
#ifdef USE_NVTX
  for (int ipo70 = 1; ipo70 <= 2; ipo70++) {
  nvtxRangePop();
    if (ipo70 == 2) jpo = 1;
#endif
    int ipo = ipo70 - 1;
#ifdef USE_NVTX
  nvtxRangePush("raba outer loop 2");
#endif
#pragma omp teams distribute parallel for
  for (int ipo = 0; ipo < 2; ipo++) {
    int jpo = 1 - ipo;
    int jpo = 1 - ipo;
    ctqce[ipo][0] = cc0;
    ctqce[ipo][0] = cc0;
    ctqce[ipo][1] = cc0;
    ctqce[ipo][1] = cc0;
@@ -1975,6 +1985,9 @@ void raba(
    dcomplex &tqcps2 = tqcps[ipo][2];
    dcomplex &tqcps2 = tqcps[ipo][2];
    int kmax = le*(le+2);
    int kmax = le*(le+2);
    // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed)
    // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed)
#ifdef USE_NVTX
  nvtxRangePush("raba inner loop 2");
#endif
#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
    for (int k = 1; k<=kmax; k++) {
    for (int k = 1; k<=kmax; k++) {
      int l60 = (int) sqrt(k+1);
      int l60 = (int) sqrt(k+1);
@@ -1989,9 +2002,8 @@ void raba(
      }
      }
      int lpo = l60 + 1;
      int lpo = l60 + 1;
      int il = l60 * lpo;
      int il = l60 * lpo;
      // int ltpo = l60 + lpo;
      int m = im60 - lpo;
      int m = im60 - lpo;
      int i = m + il;
      int i = m + il - 1;
      int ie = i + nlem;
      int ie = i + nlem;
      int mmmu = m + 1;
      int mmmu = m + 1;
      int mmmmu = (mmmu > 0) ? mmmu : -mmmu;
      int mmmmu = (mmmu > 0) ? mmmu : -mmmu;
@@ -2001,13 +2013,13 @@ void raba(
      dcomplex aca;
      dcomplex aca;
      dcomplex acap;
      dcomplex acap;
	if (mmmmu <= l60) {
	if (mmmmu <= l60) {
	  int immu = mmmu + il;
	  int immu = mmmu + il - 1;
	  int immue = immu + nlem;
	  int immue = immu + nlem;
	  rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
	  rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
	  acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][ipo];
	  acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
	  acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][jpo];
	  acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
	  aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+ipo];
	  aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
	  acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+jpo];
	  acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
	  ctqce0 += (acw * rmu);
	  ctqce0 += (acw * rmu);
	  tqcpe0 += (acwp * rmu);
	  tqcpe0 += (acwp * rmu);
	  ctqcs0 += (aca * rmu);
	  ctqcs0 += (aca * rmu);
@@ -2015,10 +2027,10 @@ void raba(
	}
	}
	// label 30
	// label 30
	rmu = -1.0 * m;
	rmu = -1.0 * m;
	acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[i - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[ie - 1][ipo];
	acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
	acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[i - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[ie - 1][jpo];
	acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
	aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(i - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(ie - 1)+ipo];
	aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
	acap = dconjg(vec_a[1*(i - 1)+ipo]) * vec_a[2*(i - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(ie - 1)+jpo];
	acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
	ctqce1 += (acw * rmu);
	ctqce1 += (acw * rmu);
	tqcpe1 += (acwp * rmu);
	tqcpe1 += (acwp * rmu);
	ctqcs1 += (aca * rmu);
	ctqcs1 += (aca * rmu);
@@ -2026,20 +2038,29 @@ void raba(
	mmmu = m - 1;
	mmmu = m - 1;
	mmmmu = (mmmu > 0) ? mmmu : -mmmu;
	mmmmu = (mmmu > 0) ? mmmu : -mmmu;
	if (mmmmu <= l60) {
	if (mmmmu <= l60) {
	  int immu = mmmu + il;
	  int immu = mmmu + il - 1;
	  int immue = immu + nlem;
	  int immue = immu + nlem;
	  rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
	  rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
	  acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][ipo];
	  acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
	  acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][jpo];
	  acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
	  aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+ipo];
	  aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
	  acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+jpo];
	  acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
	  ctqce2 += (acw * rmu);
	  ctqce2 += (acw * rmu);
	  tqcpe2 += (acwp * rmu);
	  tqcpe2 += (acwp * rmu);
	  ctqcs2 += (aca * rmu);
	  ctqcs2 += (aca * rmu);
	  tqcps2 += (acap * rmu);
	  tqcps2 += (acap * rmu);
	} // ends if clause
	} // ends if clause
    } // k loop (previously the l60 and im60 loops
    } // k loop (previously the l60 and im60 loops
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  } // ipo70 loop
  } // ipo70 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
#ifdef USE_NVTX
  nvtxRangePush("raba loop 3");
#endif
#pragma omp teams distribute parallel for simd
#pragma omp teams distribute parallel for simd
  for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
  for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
    int ipo = ipo78 - 1;
    int ipo = ipo78 - 1;
@@ -2062,6 +2083,9 @@ void raba(
    tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i);
    tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i);
    tqcps[ipo][2] = -c2;
    tqcps[ipo][2] = -c2;
  } // ipo78 loop
  } // ipo78 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  delete[] a[0];
  delete[] a[0];
  delete[] ctqce[0];
  delete[] ctqce[0];
  delete[] ctqcs[0];
  delete[] ctqcs[0];
@@ -2070,288 +2094,6 @@ void raba(
  delete[] ctqcs;
  delete[] ctqcs;
}
}


// void raba_new(
// 	  int le, dcomplex **am0m, dcomplex **w, double **tqce,
// 	  dcomplex **tqcpe, double **tqcs, dcomplex **tqcps
// ) {
//   dcomplex **a, **ctqce, **ctqcs;
//   dcomplex acw, acwp, aca, acap, c1, c2, c3;
//   const dcomplex cc0 = 0.0 + 0.0 * I;
//   const dcomplex uim = 0.0 + 1.0 * I;
//   const double sq2i = 1.0 / sqrt(2.0);
//   int nlem = le * (le + 2);
//   const int nlemt = nlem + nlem;
//   a = new dcomplex*[nlemt];
//   ctqce = new dcomplex*[2];
//   ctqcs = new dcomplex*[2];
//   a[0] = new dcomplex[nlemt*2]();
//   dcomplex *vec_a = a[0];
//   ctqce[0] = new dcomplex[6]();
//   ctqcs[0] = new dcomplex[6]();
//   for (int ai = 1; ai < nlemt; ai++) a[ai] = a[0]+ai*2;
//   ctqce[1] = ctqce[0]+3;
//   ctqcs[1] = ctqcs[0]+3;
//   dcomplex *vec_am0m = am0m[0];
//   // I cannot vectorise easily the access to w, since its size can follow either li or le, and I don't know here
//   #pragma omp parallel for
//   for (int i20 = 1; i20 <= nlemt; i20++) {
//     int i = i20 - 1;
//     dcomplex c1 = cc0;
//     dcomplex c2 = cc0;
//     #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
//     for (int j10 = 1; j10 <= nlemt; j10++) {
//       int j = j10 - 1;
//       // this is actually a matrix multiplication a =  am0m x w, I could substitute the whole nested loop with a single BLAS level 3 call
//       c1 += (vec_am0m[i*nlemt+j] * w[j][0]);
//       c2 += (vec_am0m[i*nlemt+j] * w[j][1]);
//     } // j10 loop
//     vec_a[2*i] = c1;
//     vec_a[2*i+1] = c2;
//   } //i20 loop
//   //int jpo = 2;
//   // for (int ipo70 = 1; ipo70 <= 2; ipo70++) {
//   //   if (ipo70 == 2) jpo = 1;
//   //   int ipo = ipo70 - 1;
//   // these are 2 x 3 arrays, in principle there should be a double loop over their two indices, but the second was explicitly unrolled here. This is senseless, either unroll both indices for speed, or unroll none for clarity, doing it halfway achieves neither
//   // ctqce[ipo][0] = cc0;
//   // ctqce[ipo][1] = cc0;
//   // ctqce[ipo][2] = cc0;
//   // tqcpe[ipo][0] = cc0;
//   // tqcpe[ipo][1] = cc0;
//   // tqcpe[ipo][2] = cc0;
//   // ctqcs[ipo][0] = cc0;
//   // ctqcs[ipo][1] = cc0;
//   // ctqcs[ipo][2] = cc0;
//   // tqcps[ipo][0] = cc0;
//   // tqcps[ipo][1] = cc0;
//   // tqcps[ipo][2] = cc0;
//   // so let's go for it all the way, unroll both dimensions, use auxiliary scalar variables for the reduction, to declare them in the omp pragma
//   dcomplex ctqce00 = cc0;
//   dcomplex ctqce01 = cc0;
//   dcomplex ctqce02 = cc0;
//   dcomplex ctqce10 = cc0;
//   dcomplex ctqce11 = cc0;
//   dcomplex ctqce12 = cc0;
//   dcomplex tqcpe00 = cc0;
//   dcomplex tqcpe01 = cc0;
//   dcomplex tqcpe02 = cc0;
//   dcomplex tqcpe10 = cc0;
//   dcomplex tqcpe11 = cc0;
//   dcomplex tqcpe12 = cc0;
//   dcomplex ctqcs00 = cc0;
//   dcomplex ctqcs01 = cc0;
//   dcomplex ctqcs02 = cc0;
//   dcomplex ctqcs10 = cc0;
//   dcomplex ctqcs11 = cc0;
//   dcomplex ctqcs12 = cc0;
//   dcomplex tqcps00 = cc0;
//   dcomplex tqcps01 = cc0;
//   dcomplex tqcps02 = cc0;
//   dcomplex tqcps10 = cc0;
//   dcomplex tqcps11 = cc0;
//   dcomplex tqcps12 = cc0;
//   // To parallelise, I run a linearised loop directly over k
//   // working out the algebra, it turns out that
//   // k = l60*l60-1+im60
//   // we invert this to find
//   // l60 = (int) sqrt(k+1) and im60 = k - l60*60+1
//   // but if it results im60 = 0, then we set l60 = l60-1 and im60 = 2*l60+1
//   // furthermore if it results im60 > 2*l60+1, then we set
//   // im60 = im60 -(2*l60+1) and l60 = l60+1 (there was a rounding error in a nearly exact root)
//   // with the following kmax, l60 goes from 1 to le, and im60 from 1 to 2*l60+1
//   int kmax = le*(le+2);
//   //#pragma omp target teams distribute parallel for simd reduction(+:ctqce00, ctqce01, ctqce02, ctqce10, ctqce11, ctqce12, ctqcs00, ctqcs01, ctqcs02, ctqcs10, ctqcs11, ctqcs12, tqcpe00, tqcpe01, tqcpe02, tqcpe10, tqcpe11, tqcpe12, tqcps00, tqcps01, tqcps02, tqcps10, tqcps11, tqcps12)
//   for (int k = 1; k<=kmax; k++) {
//     int l60 = (int) sqrt(k+1);
//     int im60 = k - (l60*l60) + 1;
//     if (im60 == 0) {
//       l60--;
//       im60 = 2*l60+1;
//     }
//     else if (im60 > 2*l60 + 1) {
//       im60 -= 2*l60 + 1;
//       l60++;
//     }
//     // I have all the indices in my linearised loop
//     int lpo = l60 + 1;
//     int il = l60 * lpo;
//     int m = im60 - lpo;
//     int i = m + il;
//     int ie = i + nlem;
//     int mmmu = m + 1;
//     int mmmmu = (mmmu > 0) ? mmmu : -mmmu;
//     int im1p2 = (i - 1)*2;
//     int iem1p2 = (ie - 1)*2;
//     double  rmu;
//     dcomplex acw0;
//     dcomplex acw1;
//     dcomplex acwp0;
//     dcomplex acwp1;
//     dcomplex aca0;
//     dcomplex aca1;
//     dcomplex acap0;
//     dcomplex acap1;
//     if (mmmmu <= l60) {
//       int immu = mmmu + il;
//       int immue = immu + nlem;
//       int immum1 = immu-1;
//       int vecimmu = (immum1)*2;
//       int immuem1 = immue-1;
//       int vecimmue = (immuem1)*2;
//       dcomplex *vec_w = w[immum1];
//       dcomplex *vec_we = w[immuem1];
//       rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
//       acw0 = dconjg(a[i - 1][0]) * w[immu - 1][0] + dconjg(a[ie - 1][0]) * w[immue - 1][0];
//       acw1 = dconjg(a[i - 1][1]) * w[immu - 1][1] + dconjg(a[ie - 1][1]) * w[immue - 1][1];
//       acwp0 = dconjg(a[i - 1][0]) * w[immu - 1][1] + dconjg(a[ie - 1][0]) * w[immue - 1][1];
//       acwp1 = dconjg(a[i - 1][1]) * w[immu - 1][0] + dconjg(a[ie - 1][1]) * w[immue - 1][0];
//       aca0 = dconjg(a[i - 1][0]) * a[immu - 1][0] + dconjg(a[ie - 1][0]) * a[immue - 1][0];
//       aca1 = dconjg(a[i - 1][1]) * a[immu - 1][1] + dconjg(a[ie - 1][1]) * a[immue - 1][1];
//       acap0 = dconjg(a[i - 1][0]) * a[immu - 1][1] + dconjg(a[ie - 1][0]) * a[immue - 1][1];
//       acap1 = dconjg(a[i - 1][1]) * a[immu - 1][0] + dconjg(a[ie - 1][1]) * a[immue - 1][0];
//       // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0];
//       // acw1 = dconjg(vec_a[im1p2+1]) * vec_w[1] + dconjg(vec_a[iem1p2+1]) * vec_we[1];
//       // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1];
//       // acwp1 = dconjg(vec_a[im1p2+1]) * vec_w[0] + dconjg(vec_a[im1p2+1]) * vec_we[0];
//       // aca0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue];
//       // aca1 = dconjg(vec_a[im1p2+1]) * vec_a[vecimmu+1] + dconjg(vec_a[iem1p2+1]) * vec_a[vecimmue+1];
//       // acap0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu+1] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue+1];
//       // acap1 = dconjg(vec_a[im1p2+1]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2+1]) * vec_a[vecimmue];
//       ctqce00 += (acw0 * rmu);
//       ctqce10 += (acw0 * rmu);
//       tqcpe00 += (acwp0 * rmu);
//       tqcpe10 += (acwp1 * rmu);
//       ctqcs00 += (aca0 * rmu);
//       ctqcs10 += (aca1 * rmu);
//       tqcps00 += (acap0 * rmu);
//       tqcps10 += (acap1 * rmu);
//     }
//     // label 30
//     dcomplex *vec_w = w[i-1];
//     dcomplex *vec_we = w[ie-1];
//     rmu = -1.0 * m;
//     acw0 = dconjg(a[i - 1][0]) * w[i - 1][0] + dconjg(a[ie - 1][0]) * w[ie - 1][0];
//     acw1 = dconjg(a[i - 1][1]) * w[i - 1][1] + dconjg(a[ie - 1][1]) * w[ie - 1][1];
//     acwp0 = dconjg(a[i - 1][0]) * w[i - 1][1] + dconjg(a[ie - 1][0]) * w[ie - 1][1];
//     acwp1 = dconjg(a[i - 1][1]) * w[i - 1][0] + dconjg(a[ie - 1][1]) * w[ie - 1][0];
//     aca0 = dconjg(a[i - 1][0]) * a[i - 1][0] + dconjg(a[ie - 1][0]) * a[ie - 1][0];
//     aca1 = dconjg(a[i - 1][1]) * a[i - 1][1] + dconjg(a[ie - 1][1]) * a[ie - 1][1];
//     acap0 = dconjg(a[i - 1][0]) * a[i - 1][1] + dconjg(a[ie - 1][0]) * a[ie - 1][1];
//     acap1 = dconjg(a[i - 1][1]) * a[i - 1][0] + dconjg(a[ie - 1][1]) * a[ie - 1][0];
//     // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0];
//     // acw1 = dconjg(vec_a[im1p2 + 1]) * vec_w[1] + dconjg(vec_a[iem1p2 + 1]) * vec_we[1];
//     // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1];
//     // acwp1 = dconjg(vec_a[im1p2 + 1]) * vec_w[0] + dconjg(vec_a[iem1p2 + 1]) * vec_we[0];
//     // aca0 = dconjg(vec_a[im1p2]) * vec_a[im1p2] + dconjg(vec_a[iem1p2]) * vec_a[iem1p2];
//     // aca1 = dconjg(vec_a[im1p2 + 1]) * vec_a[im1p2 + 1] + dconjg(vec_a[iem1p2 + 1]) * vec_a[iem1p2 + 1];
//     // acap0 = dconjg(vec_a[im1p2]) * vec_a[im1p2 + 1] + dconjg(vec_a[iem1p2]) * vec_a[iem1p2 + 1];
//     // acap1 = dconjg(vec_a[im1p2 + 1]) * vec_a[im1p2] + dconjg(vec_a[iem1p2 + 1]) * vec_a[iem1p2];
//     ctqce01 += (acw0 * rmu);
//     ctqce11 += (acw1 * rmu);
//     tqcpe01 += (acwp0 * rmu);
//     tqcpe11 += (acwp1 * rmu);
//     ctqcs01 += (aca0 * rmu);
//     ctqcs11 += (aca1 * rmu);
//     tqcps01 += (acap0 * rmu);
//     tqcps11 += (acap1 * rmu);
//     mmmu = m - 1;
//     mmmmu = (mmmu > 0) ? mmmu : -mmmu;
//     if (mmmmu <= l60) {
//       int immu = mmmu + il;
//       int immue = immu + nlem;
//       int immum1 = immu-1;
//       int vecimmu = (immum1)*2;
//       int immuem1 = immue-1;
//       int vecimmue = (immuem1)*2;
//       dcomplex *vec_w = w[immum1];
//       dcomplex *vec_we = w[immuem1];
//       rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
//       acw0 = dconjg(a[i - 1][0]) * w[immu - 1][0] + dconjg(a[ie - 1][0]) * w[immue - 1][0];
//       acw1 = dconjg(a[i - 1][1]) * w[immu - 1][1] + dconjg(a[ie - 1][1]) * w[immue - 1][1];
//       acwp0 = dconjg(a[i - 1][0]) * w[immu - 1][1] + dconjg(a[ie - 1][0]) * w[immue - 1][1];
//       acwp1 = dconjg(a[i - 1][1]) * w[immu - 1][0] + dconjg(a[ie - 1][1]) * w[immue - 1][0];
//       aca0 = dconjg(a[i - 1][0]) * a[immu - 1][0] + dconjg(a[ie - 1][0]) * a[immue - 1][0];
//       aca1 = dconjg(a[i - 1][1]) * a[immu - 1][1] + dconjg(a[ie - 1][1]) * a[immue - 1][1];
//       acap0 = dconjg(a[i - 1][0]) * a[immu - 1][1] + dconjg(a[ie - 1][0]) * a[immue - 1][1];
//       acap1 = dconjg(a[i - 1][1]) * a[immu - 1][0] + dconjg(a[ie - 1][1]) * a[immue - 1][0];
//       // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0];
//       // acw1 = dconjg(vec_a[im1p2 + 1]) * vec_w[1] + dconjg(vec_a[iem1p2 + 1]) * vec_we[1];
//       // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1];
//       // acwp1 = dconjg(vec_a[im1p2 + 1]) * vec_w[0] + dconjg(vec_a[iem1p2 + 1]) * vec_we[0];
//       // aca0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue];
//       // aca1 = dconjg(vec_a[im1p2 + 1]) * vec_a[vecimmu + 1] + dconjg(vec_a[iem1p2 + 1]) * vec_a[vecimmue + 1];
//       // acap0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu + 1] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue + 1];
//       // acap1 = dconjg(vec_a[im1p2 + 1]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2 + 1]) * vec_a[vecimmue];
//       ctqce02 += (acw0 * rmu);
//       ctqce12 += (acw1 * rmu);
//       tqcpe02 += (acwp0 * rmu);
//       tqcpe12 += (acwp1 * rmu);
//       ctqcs02 += (aca0 * rmu);
//       ctqcs12 += (aca1 * rmu);
//       tqcps02 += (acap0 * rmu);
//       tqcps12 += (acap1 * rmu);
//     } // ends if clause
//   } // k loop (previously the l60 and im60 loops
//   ctqce[0][0] = ctqce00;
//   ctqce[0][1] = ctqce01;
//   ctqce[0][2] = ctqce02;
//   ctqce[1][0] = ctqce10;
//   ctqce[1][1] = ctqce11;
//   ctqce[1][2] = ctqce12;

//   ctqcs[0][0] = ctqcs00;
//   ctqcs[0][1] = ctqcs01;
//   ctqcs[0][2] = ctqcs02;
//   ctqcs[1][0] = ctqcs10;
//   ctqcs[1][1] = ctqcs11;
//   ctqcs[1][2] = ctqcs12;

//   tqcpe[0][0] = tqcpe00;
//   tqcpe[0][1] = tqcpe01;
//   tqcpe[0][2] = tqcpe02;
//   tqcpe[1][0] = tqcpe10;
//   tqcpe[1][1] = tqcpe11;
//   tqcpe[1][2] = tqcpe12;

//   tqcps[0][0] = tqcps00;
//   tqcps[0][1] = tqcps01;
//   tqcps[0][2] = tqcps02;
//   tqcps[1][0] = tqcps10;
//   tqcps[1][1] = tqcps11;
//   tqcps[1][2] = tqcps12;

//   // this is a ridiculously small loop (3 iterations!) of assignments. Would be not worth parallelising, but since it's easy do it on the cpus
//   //#pragma omp teams distribute parallel for simd
//   for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
//     int ipo = ipo78 - 1;
//     tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;
//     tqce[ipo][1] = real((ctqce[ipo][0] + ctqce[ipo][2]) * uim) * sq2i;
//     tqce[ipo][2] = real(ctqce[ipo][1]);
//     c1 = tqcpe[ipo][0];
//     c2 = tqcpe[ipo][1];
//     c3 = tqcpe[ipo][2];
//     tqcpe[ipo][0] = (c1 - c3) * sq2i;
//     tqcpe[ipo][1] = (c1 + c3) * (uim * sq2i);
//     tqcpe[ipo][2] = c2;
//     tqcs[ipo][0] = -sq2i * real(ctqcs[ipo][0] - ctqcs[ipo][2]);
//     tqcs[ipo][1] = -sq2i * real((ctqcs[ipo][0] + ctqcs[ipo][2]) * uim);
//     tqcs[ipo][2] = -1.0 * real(ctqcs[ipo][1]);
//     c1 = tqcps[ipo][0];
//     c2 = tqcps[ipo][1];
//     c3 = tqcps[ipo][2];
//     tqcps[ipo][0] = -(c1 - c3) * sq2i;
//     tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i);
//     tqcps[ipo][2] = -c2;
//   } // ipo78 loop
//   // Clean memory
//   delete[] a[0];
//   delete[] ctqce[0];
//   delete[] ctqcs[0];
//   delete[] a;
//   delete[] ctqce;
//   delete[] ctqcs;
// }

void rftr(
void rftr(
	  double *u, double *up, double *un, double *gapv, double extins, double scatts,
	  double *u, double *up, double *un, double *gapv, double extins, double scatts,
	  double &rapr, double &cosav, double &fp, double &fn, double &fk, double &fx,
	  double &rapr, double &cosav, double &fp, double &fn, double &fk, double &fx,