Commit 0e564335 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Use OpenMP to parallelize loops in apcra()

parent 3e620cb4
Loading
Loading
Loading
Loading
+173 −170
Original line number Diff line number Diff line
@@ -165,17 +165,23 @@ void apcra(
) {
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  dcomplex uimtl, uimtls, ca11, ca12, ca21, ca22;
  dcomplex a11, a12, a21, a22, fc;
  // dcomplex uimtl, uimtls, ca11, ca12, ca21, ca22;
  // dcomplex a11, a12, a21, a22, fc;
  double *vec_svw = new double[le * 12]();
  dcomplex *vec_svs = new dcomplex[le * 12]();
  dcomplex sum1 = cc0;
  dcomplex sum2 = cc0;
  int nlem = le * (le + 2);
#pragma omp parallel
  {
#pragma omp for
    for (int l28 = 1; l28 <= le; l28++) {
      int lpo = l28 + 1;
      int ltpo = lpo + l28;
      double fl = sqrt(1.0 * ltpo);
      for (int ilmp = 1; ilmp <= 3; ilmp++) {
	if ((l28 == 1 && ilmp == 1) || (l28 == le && ilmp == 3)) continue; // ilmp loop
	const int svw_start = 12 * (l28 - 1) + 4 * (ilmp - 1);
	int lmpml = ilmp - 2;
	int lmp = l28 + lmpml;
	double flmp = sqrt(1.0 * (lmp + lmp + 1));
@@ -185,19 +191,19 @@ void apcra(
	if (inpol == 0) {
	  double cgs = cgmpo + cgmmo;
	  double cgd = cgmpo - cgmmo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1)] = cgs;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 1] = cgd;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 2] = cgd;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 3] = cgs;
	  vec_svw[svw_start] = cgs;
	  vec_svw[svw_start + 1] = cgd;
	  vec_svw[svw_start + 2] = cgd;
	  vec_svw[svw_start + 3] = cgs;
	} else { // label 22
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1)] = cgmpo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 2] = cgmpo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 1] = -cgmmo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 3] = cgmmo;
	  vec_svw[svw_start] = cgmpo;
	  vec_svw[svw_start + 2] = cgmpo;
	  vec_svw[svw_start + 1] = -cgmmo;
	  vec_svw[svw_start + 3] = cgmmo;
	}
	// label 26
      } // ilmp loop
  } // l28 loop
    } // l28 parallel loop
    // for (int l30 = 1; l30 <= le; l30++) { // 0-init: can be omitted
    //   for (int ilmp = 1; ilmp <= 3; ilmp++) {
    //     for (int ipa = 1; ipa <= 2; ipa++) {
@@ -207,16 +213,18 @@ void apcra(
    //     } // ipa loop
    //   } // ilmp loop
    // } // l30 loop
#pragma omp for reduction(+: vec_svs[0:12*le])
    for (int l58 = 1; l58 <= le; l58 ++) {
      int lpo = l58 + 1;
      int ltpo = l58 + lpo;
      int imm = l58 * lpo;
      for (int ilmp = 1; ilmp <= 3; ilmp++) {
	if ((l58 == 1 && ilmp == 1) || (l58 == le && ilmp == 3)) continue; // ilmp loop
	const int svs_start = 12 * (l58 - 1) + 4 * (ilmp - 1);
	int lmpml = ilmp - 2;
	int lmp = l58 + lmpml;
	int impmm = lmp * (lmp + 1);
      uimtl = uim * (1.0 * lmpml);
	dcomplex uimtl = uim * (1.0 * lmpml);
	if (lmpml == 0) uimtl = 1.0 + 0.0 * I;
	for (int im54 = 1; im54 <= ltpo; im54++) {
	  int m = im54 - lpo;
@@ -239,7 +247,7 @@ void apcra(
		  int lsmpml = ilsmp - 2;
		  int lsmp = ls + lsmpml;
		  int ismpmm = lsmp * (lsmp + 1);
		uimtls = -uim * (1.0 * lsmpml);
		  dcomplex uimtls = -uim * (1.0 * lsmpml);
		  if (lsmpml == 0) uimtls = 1.0 + 0.0 * I;
		  for (int ims = 1; ims <= lstpo; ims++) {
		    int ms = ims - lspo;
@@ -251,37 +259,43 @@ void apcra(
		      int ismp = ismpmm + msmp;
		      int ismpe = ismp + nlem;
		      double cgcs = cg1(lsmpml, mu, ls, ms);
		    fc = (uimtl * uimtls) * (cgc * cgcs);
		    ca11 = dconjg(am0m[is - 1][i - 1]);
		    ca12 = dconjg(am0m[is - 1][ie - 1]);
		    ca21 = dconjg(am0m[ise - 1][i - 1]);
		    ca22 = dconjg(am0m[ise - 1][ie - 1]);
		    a11 = am0m[ismp - 1][imp - 1];
		    a12 = am0m[ismp - 1][impe - 1];
		    a21 = am0m[ismpe - 1][imp - 1];
		    a22 = am0m[ismpe - 1][impe - 1];
		      dcomplex fc = (uimtl * uimtls) * (cgc * cgcs);
		      dcomplex ca11 = dconjg(am0m[is - 1][i - 1]);
		      dcomplex ca12 = dconjg(am0m[is - 1][ie - 1]);
		      dcomplex ca21 = dconjg(am0m[ise - 1][i - 1]);
		      dcomplex ca22 = dconjg(am0m[ise - 1][ie - 1]);
		      dcomplex a11 = am0m[ismp - 1][imp - 1];
		      dcomplex a12 = am0m[ismp - 1][impe - 1];
		      dcomplex a21 = am0m[ismpe - 1][imp - 1];
		      dcomplex a22 = am0m[ismpe - 1][impe - 1];
		      double z11 = zpv[ls - 1][ilsmp - 1][0][0];
		      double z12 = zpv[ls - 1][ilsmp - 1][0][1];
		      double z21 = zpv[ls - 1][ilsmp - 1][1][0];
		      double z22 = zpv[ls - 1][ilsmp - 1][1][1];
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1)] += (
		      vec_svs[svs_start] += (
		        (
		           ca11 * a11 * z11 + ca11 * a21 * z12
			   + ca21 * a11 * z21 + ca21 * a21 * z22
		        ) * fc
		      );
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1) + 1] += ((ca11 * a12 * z11
						      + ca11 * a22 * z12
						      + ca21 * a12 * z21
						      + ca21 * a22 * z22) * fc);
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1) + 2] += ((ca12 * a11 * z11
						      + ca12 * a21 * z12
						      + ca22 * a11 * z21
						      + ca22 * a21 * z22) * fc);
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1) + 3] += ((ca12 * a12 * z11
						      + ca12 * a22 * z12
						      + ca22 * a12 * z21
						      + ca22 * a22 * z22) * fc);
		      vec_svs[svs_start + 1] += (
		        (
		          ca11 * a12 * z11 + ca11 * a22 * z12
			  + ca21 * a12 * z21 + ca21 * a22 * z22
		        ) * fc
		      );
		      vec_svs[svs_start + 2] += (
		        (
		          ca12 * a11 * z11 + ca12 * a21 * z12
			  + ca22 * a11 * z21 + ca22 * a21 * z22
		        ) * fc
		      );
		      vec_svs[svs_start + 3] += (
		        (
		          ca12 * a12 * z11 + ca12 * a22 * z12
			  + ca22 * a12 * z21 + ca22 * a22 * z22
		        ) * fc
	              );
		    } // ends ims loop
		  } // ims loop
		} // ilsmp loop
@@ -291,8 +305,7 @@ void apcra(
	} // im54 loop
      } // ilmp loop
    } // l58 loop
  dcomplex sum1 = cc0;
  dcomplex sum2 = cc0;
#pragma omp for reduction(+: sum1, sum2)
    for (int l68 = 1; l68 <= le; l68++) {
      //int lpo = l68 + 1;
      //int ltpo = l68 + lpo;
@@ -328,6 +341,7 @@ void apcra(
	} // label 66, ends ilmp loop
      } // ilmp loop
    } // l68 loop
  } // OMP parallel region
  const double half_pi = acos(0.0);
  double cofs = half_pi * 2.0 / sqk;
  gaprm[0][0] = 0.0;
@@ -352,18 +366,7 @@ void apcra(
    gappm[2][0] = cc0;
    gappm[2][1] = cc0;
  }
  
  // Clean memory
  // for (int i = le - 1; i > -1; i--) {
  //   for (int j = 2; j > -1; j--) {
  //     delete[] svw[i][j];
  //     delete[] svs[i][j];
  //   }
  //   delete[] svw[i];
  //   delete[] svs[i];
  // }
  // delete[] svw;
  // delete[] svs;
  delete[] vec_svw;
  delete[] vec_svs;
}