Commit 3e620cb4 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Use vectorized local matrices in apcra()

parent 3336099a
Loading
Loading
Loading
Loading
+61 −73
Original line number Diff line number Diff line
@@ -166,21 +166,9 @@ void apcra(
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  dcomplex uimtl, uimtls, ca11, ca12, ca21, ca22;
  dcomplex a11, a12, a21, a22, sum1, sum2, fc;
  double ****svw = new double***[le];
  dcomplex ****svs = new dcomplex***[le];
  for (int i = 0; i < le; i++) {
    svw[i] = new double**[3];
    svs[i] = new dcomplex**[3];
    for (int j = 0; j < 3; j++) {
      svw[i][j] = new double*[2];
      svs[i][j] = new dcomplex*[2];
      for (int k = 0; k < 2; k++) {
	svw[i][j][k] = new double[2]();
	svs[i][j][k] = new dcomplex[2]();
      }
    }
  }
  dcomplex a11, a12, a21, a22, fc;
  double *vec_svw = new double[le * 12]();
  dcomplex *vec_svs = new dcomplex[le * 12]();
  int nlem = le * (le + 2);
  for (int l28 = 1; l28 <= le; l28++) {
    int lpo = l28 + 1;
@@ -197,28 +185,28 @@ void apcra(
      if (inpol == 0) {
	double cgs = cgmpo + cgmmo;
	double cgd = cgmpo - cgmmo;
	svw[l28 - 1][ilmp - 1][0][0] = cgs;
	svw[l28 - 1][ilmp - 1][0][1] = cgd;
	svw[l28 - 1][ilmp - 1][1][0] = cgd;
	svw[l28 - 1][ilmp - 1][1][1] = cgs;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1)] = cgs;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 1] = cgd;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 2] = cgd;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 3] = cgs;
      } else { // label 22
	svw[l28 - 1][ilmp - 1][0][0] = cgmpo;
	svw[l28 - 1][ilmp - 1][1][0] = cgmpo;
	svw[l28 - 1][ilmp - 1][0][1] = -cgmmo;
	svw[l28 - 1][ilmp - 1][1][1] = cgmmo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1)] = cgmpo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 2] = cgmpo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 1] = -cgmmo;
	vec_svw[12 * (l28 - 1) + 4 * (ilmp - 1) + 3] = cgmmo;
      }
      // label 26
    } // ilmp loop
  } // l28 loop
  for (int l30 = 1; l30 <= le; l30++) { // 0-init: can be omitted
    for (int ilmp = 1; ilmp <= 3; ilmp++) {
      for (int ipa = 1; ipa <= 2; ipa++) {
	for (int ipamp = 1; ipamp <= 2; ipamp++) {
	  svs[l30 - 1][ilmp - 1][ipa - 1][ipamp - 1] = cc0;
	}
      } // ipa loop
    } // ilmp loop
  } // l30 loop
  // for (int l30 = 1; l30 <= le; l30++) { // 0-init: can be omitted
  //   for (int ilmp = 1; ilmp <= 3; ilmp++) {
  //     for (int ipa = 1; ipa <= 2; ipa++) {
  // 	for (int ipamp = 1; ipamp <= 2; ipamp++) {
  // 	  vec_svs[12 * (l30 - 1) + 4 * (ilmp - 1) + 2 * (ipa - 1) + ipamp - 1] = cc0;
  // 	}
  //     } // ipa loop
  //   } // ilmp loop
  // } // l30 loop
  for (int l58 = 1; l58 <= le; l58 ++) {
    int lpo = l58 + 1;
    int ltpo = l58 + lpo;
@@ -276,19 +264,21 @@ void apcra(
		    double z12 = zpv[ls - 1][ilsmp - 1][0][1];
		    double z21 = zpv[ls - 1][ilsmp - 1][1][0];
		    double z22 = zpv[ls - 1][ilsmp - 1][1][1];
		    svs[l58 - 1][ilmp - 1][0][0] += ((ca11 * a11 * z11
						      + ca11 * a21 * z12
						      + ca21 * a11 * z21
						      + ca21 * a21 * z22) * fc);
		    svs[l58 - 1][ilmp - 1][0][1] += ((ca11 * a12 * z11
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1)] += (
		      (
		         ca11 * a11 * z11 + ca11 * a21 * z12
			 + ca21 * a11 * z21 + ca21 * a21 * z22
		      ) * fc
		    );
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1) + 1] += ((ca11 * a12 * z11
						      + ca11 * a22 * z12
						      + ca21 * a12 * z21
						      + ca21 * a22 * z22) * fc);
		    svs[l58 - 1][ilmp - 1][1][0] += ((ca12 * a11 * z11
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1) + 2] += ((ca12 * a11 * z11
						      + ca12 * a21 * z12
						      + ca22 * a11 * z21
						      + ca22 * a21 * z22) * fc);
		    svs[l58 - 1][ilmp - 1][1][1] += ((ca12 * a12 * z11
		    vec_svs[12 * (l58 - 1) + 4 * (ilmp - 1) + 3] += ((ca12 * a12 * z11
						      + ca12 * a22 * z12
						      + ca22 * a12 * z21
						      + ca22 * a22 * z22) * fc);
@@ -301,8 +291,8 @@ void apcra(
      } // im54 loop
    } // ilmp loop
  } // l58 loop
  sum1 = cc0;
  sum2 = cc0;
  dcomplex sum1 = cc0;
  dcomplex sum2 = cc0;
  for (int l68 = 1; l68 <= le; l68++) {
    //int lpo = l68 + 1;
    //int ltpo = l68 + lpo;
@@ -311,29 +301,29 @@ void apcra(
      if ((l68 == 1 && ilmp == 1) || (l68 == le && ilmp == 3)) continue; // ilmp loop
      if (inpol == 0) {
	sum1 += (
		 svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][1][1]
		 vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 );
	sum2 += (
		 svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][1][1]
		 vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 );
      } else { // label 62
	sum1 += (
		 svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][1][1]
		 vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 );
	sum2 += (
		 svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][1][1]
		 vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
		 + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 );
      } // label 66, ends ilmp loop
    } // ilmp loop
@@ -364,20 +354,18 @@ void apcra(
  }
  
  // Clean memory
  for (int i = le - 1; i > -1; i--) {
    for (int j = 2; j > -1; j--) {
      for (int k = 1; k > -1; k--) {
	delete[] svw[i][j][k];
	delete[] svs[i][j][k];
      }
      delete[] svw[i][j];
      delete[] svs[i][j];
    }
    delete[] svw[i];
    delete[] svs[i];
  }
  delete[] svw;
  delete[] svs;
  // for (int i = le - 1; i > -1; i--) {
  //   for (int j = 2; j > -1; j--) {
  //     delete[] svw[i][j];
  //     delete[] svs[i][j];
  //   }
  //   delete[] svw[i];
  //   delete[] svs[i];
  // }
  // delete[] svw;
  // delete[] svs;
  delete[] vec_svw;
  delete[] vec_svs;
}

#ifdef USE_TARGET_OFFLOAD