Commit 96eb7327 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Revert to non-parallel apcra() and minimally parallel apc()

parent 0e564335
Loading
Loading
Loading
Loading
+244 −242
Original line number Diff line number Diff line
@@ -56,22 +56,21 @@ void apc(
  dcomplex *vec_ac, *vec_gap;
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  dcomplex uimmp, summ, sume, suem, suee, summp, sumep;
  dcomplex suemp, sueep;
  // dcomplex uimmp, summ, sume, suem, suee, summp, sumep;
  // dcomplex suemp, sueep;
  double cof = 1.0 / sqk;
  double cimu = cof / sqrt(2.0);
  int nlem = le * (le + 2);
  const int nlemt = nlem + nlem;
  const int nlemf = 2 * nlemt;
  vec_ac = new dcomplex[nlemf]();
  vec_ac = new dcomplex[2 * nlemt]();
  vec_gap = new dcomplex[6]();
#pragma omp parallel for reduction(+: vec_ac[0:nlemf])
  for (np_int ij = 0; ij < nlemt * nlemt; ij++) {
    np_int j45 = ij / nlemt;
    np_int i45 = ij % nlemt;
#pragma omp parallel for simd
  for (int j45 = 0; j45 < nlemt; j45++) {
    for (int i45 = 0; i45 < nlemt; i45++) {
      vec_ac[2 * j45] += (am0m[j45][i45] * w[i45][0]);
      vec_ac[2 * j45 + 1] += (am0m[j45][i45] * w[i45][1]);
  } // ij loop
    } //i45 loop
  } //j45 loop
  for (int imu90 = 1; imu90 <=3; imu90++) {
    int mu = imu90 - 2;
    gapp[imu90 - 1][0] = cc0;
@@ -84,7 +83,7 @@ void apc(
	if ((l80 == 1 && ilmp == 1) || (l80 == le && ilmp == 3)) continue; // ilmp loop
	int lmpml = ilmp - 2;
	int lmp = l80 + lmpml;
	uimmp = (-1.0 * lmpml) * uim;
	dcomplex uimmp = (-1.0 * lmpml) * uim;
	int impmmmp = lmp * (lmp + 1);
	for (int im70 = 1; im70 <= ltpo; im70++) {
	  int m = im70 - lpo;
@@ -96,17 +95,18 @@ void apc(
	    int imp = impmmmp + mmp;
	    int impe = imp + nlem;
	    double cgc = cg1(lmpml, mu, l80, m);
	    int jpo = 1;
	    for (int ipo = 0; ipo < 2; ipo++) {
	      if (ipo == 1) jpo = 0;
	      summ = dconjg(vec_ac[2 * (i - 1) + ipo]) * vec_ac[2 * (imp - 1) + ipo];
	      sume = dconjg(vec_ac[2 * (i - 1) + ipo]) * vec_ac[2 * (impe - 1) + ipo];
	      suem = dconjg(vec_ac[2 * (ie - 1) + ipo]) * vec_ac[2 * (imp - 1) + ipo];
	      suee = dconjg(vec_ac[2 * (ie - 1) + ipo]) * vec_ac[2 * (impe - 1) + ipo];
	      summp = dconjg(vec_ac[2 * (i - 1) + jpo]) * vec_ac[2 * (imp - 1) + ipo];
	      sumep = dconjg(vec_ac[2 * (i - 1) + jpo]) * vec_ac[2 * (impe - 1) + ipo];
	      suemp = dconjg(vec_ac[2 * (ie - 1) + jpo]) * vec_ac[2 * (imp - 1) + ipo];
	      sueep = dconjg(vec_ac[2 * (ie - 1) + jpo]) * vec_ac[2 * (impe - 1) +ipo];
	    int jpo = 2;
	    for (int ipo = 1; ipo <= 2; ipo++) {
	      dcomplex summ, sume, suem, suee, summp, sumep, suemp, sueep;
	      if (ipo == 2) jpo = 1;
	      summ = dconjg(vec_ac[2 * (i - 1) + ipo - 1]) * vec_ac[2 * (imp - 1) + ipo - 1];
	      sume = dconjg(vec_ac[2 * (i - 1) + ipo - 1]) * vec_ac[2 * (impe - 1) + ipo - 1];
	      suem = dconjg(vec_ac[2 * (ie - 1) + ipo - 1]) * vec_ac[2 * (imp - 1) + ipo - 1];
	      suee = dconjg(vec_ac[2 * (ie - 1) + ipo - 1]) * vec_ac[2 * (impe - 1) + ipo - 1];
	      summp = dconjg(vec_ac[2 * (i - 1) + jpo - 1]) * vec_ac[2 * (imp - 1) + ipo - 1];
	      sumep = dconjg(vec_ac[2 * (i - 1) + jpo - 1]) * vec_ac[2 * (impe - 1) + ipo - 1];
	      suemp = dconjg(vec_ac[2 * (ie - 1) + jpo - 1]) * vec_ac[2 * (imp - 1) + ipo - 1];
	      sueep = dconjg(vec_ac[2 * (ie - 1) + jpo - 1]) * vec_ac[2 * (impe - 1) + ipo - 1];
	      if (lmpml != 0) {
		summ *= uimmp;
		sume *= uimmp;
@@ -118,7 +118,7 @@ void apc(
		sueep *= uimmp;
	      }
	      // label 55
	      vec_gap[2 * (imu90 - 1) + ipo] += (
	      vec_gap[2 * (imu90 - 1) + ipo - 1] += (
		(
		  summ * zpv[l80 - 1][ilmp - 1][0][0]
		  + sume * zpv[l80 - 1][ilmp - 1][0][1]
@@ -126,7 +126,7 @@ void apc(
		  + suee * zpv[l80 - 1][ilmp - 1][1][1]
		) * cgc
	      );
	      gapp[imu90 - 1][ipo] += (
	      gapp[imu90 - 1][ipo - 1] += (
		(
		  summp * zpv[l80 - 1][ilmp - 1][0][0]
		  + sumep * zpv[l80 - 1][ilmp - 1][0][1]
@@ -140,19 +140,20 @@ void apc(
      } // ilmp loop
    } // l80 loop
  } // imu90 loop
  for (int ipo95 = 0; ipo95 < 2; ipo95++) {
    sume = vec_gap[ipo95] * cimu;
    suee = vec_gap[2 + ipo95] * cof;
    suem = vec_gap[4 + ipo95] * cimu;
    gapr[0][ipo95] = real(sume - suem);
    gapr[1][ipo95] = real((sume + suem) * uim);
    gapr[2][ipo95] = real(suee);
    sumep = gapp[0][ipo95] * cimu;
    sueep = gapp[1][ipo95] * cof;
    suemp = gapp[2][ipo95] * cimu;
    gapp[0][ipo95] = sumep - suemp;
    gapp[1][ipo95] = (sumep + suemp) * uim;
    gapp[2][ipo95] = sueep;
  for (int ipo95 = 1; ipo95 <= 2; ipo95++) {
    dcomplex sume, suee, suem, sumep, sueep, suemp;
    sume = vec_gap[ipo95 - 1] * cimu;
    suee = vec_gap[2 + ipo95 - 1] * cof;
    suem = vec_gap[4 + ipo95 - 1] * cimu;
    gapr[0][ipo95 - 1] = real(sume - suem);
    gapr[1][ipo95 - 1] = real((sume + suem) * uim);
    gapr[2][ipo95 - 1] = real(suee);
    sumep = gapp[0][ipo95 - 1] * cimu;
    sueep = gapp[1][ipo95 - 1] * cof;
    suemp = gapp[2][ipo95 - 1] * cimu;
    gapp[0][ipo95 - 1] = sumep - suemp;
    gapp[1][ipo95 - 1] = (sumep + suemp) * uim;
    gapp[2][ipo95 - 1] = sueep;
  } // ipo95 loop
  // Clean memory
  delete[] vec_ac;
@@ -165,23 +166,29 @@ void apcra(
) {
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  // dcomplex uimtl, uimtls, ca11, ca12, ca21, ca22;
  // dcomplex a11, a12, a21, a22, fc;
  double *vec_svw = new double[le * 12]();
  dcomplex *vec_svs = new dcomplex[le * 12]();
  dcomplex sum1 = cc0;
  dcomplex sum2 = cc0;
  dcomplex uimtl, uimtls, ca11, ca12, ca21, ca22;
  dcomplex a11, a12, a21, a22, sum1, sum2, fc;
  double ****svw = new double***[le];
  dcomplex ****svs = new dcomplex***[le];
  for (int i = 0; i < le; i++) {
    svw[i] = new double**[3];
    svs[i] = new dcomplex**[3];
    for (int j = 0; j < 3; j++) {
      svw[i][j] = new double*[2];
      svs[i][j] = new dcomplex*[2];
      for (int k = 0; k < 2; k++) {
	svw[i][j][k] = new double[2]();
	svs[i][j][k] = new dcomplex[2]();
      }
    }
  }
  int nlem = le * (le + 2);
#pragma omp parallel
  {
#pragma omp for
  for (int l28 = 1; l28 <= le; l28++) {
    int lpo = l28 + 1;
    int ltpo = lpo + l28;
    double fl = sqrt(1.0 * ltpo);
    for (int ilmp = 1; ilmp <= 3; ilmp++) {
      if ((l28 == 1 && ilmp == 1) || (l28 == le && ilmp == 3)) continue; // ilmp loop
	const int svw_start = 12 * (l28 - 1) + 4 * (ilmp - 1);
      int lmpml = ilmp - 2;
      int lmp = l28 + lmpml;
      double flmp = sqrt(1.0 * (lmp + lmp + 1));
@@ -191,40 +198,38 @@ void apcra(
      if (inpol == 0) {
	double cgs = cgmpo + cgmmo;
	double cgd = cgmpo - cgmmo;
	  vec_svw[svw_start] = cgs;
	  vec_svw[svw_start + 1] = cgd;
	  vec_svw[svw_start + 2] = cgd;
	  vec_svw[svw_start + 3] = cgs;
	svw[l28 - 1][ilmp - 1][0][0] = cgs;
	svw[l28 - 1][ilmp - 1][0][1] = cgd;
	svw[l28 - 1][ilmp - 1][1][0] = cgd;
	svw[l28 - 1][ilmp - 1][1][1] = cgs;
      } else { // label 22
	  vec_svw[svw_start] = cgmpo;
	  vec_svw[svw_start + 2] = cgmpo;
	  vec_svw[svw_start + 1] = -cgmmo;
	  vec_svw[svw_start + 3] = cgmmo;
	svw[l28 - 1][ilmp - 1][0][0] = cgmpo;
	svw[l28 - 1][ilmp - 1][1][0] = cgmpo;
	svw[l28 - 1][ilmp - 1][0][1] = -cgmmo;
	svw[l28 - 1][ilmp - 1][1][1] = cgmmo;
      }
      // label 26
    } // ilmp loop
    } // l28 parallel loop
    // for (int l30 = 1; l30 <= le; l30++) { // 0-init: can be omitted
    //   for (int ilmp = 1; ilmp <= 3; ilmp++) {
    //     for (int ipa = 1; ipa <= 2; ipa++) {
    // 	for (int ipamp = 1; ipamp <= 2; ipamp++) {
    // 	  vec_svs[12 * (l30 - 1) + 4 * (ilmp - 1) + 2 * (ipa - 1) + ipamp - 1] = cc0;
    // 	}
    //     } // ipa loop
    //   } // ilmp loop
    // } // l30 loop
#pragma omp for reduction(+: vec_svs[0:12*le])
  } // l28 loop
  for (int l30 = 1; l30 <= le; l30++) { // 0-init: can be omitted
    for (int ilmp = 1; ilmp <= 3; ilmp++) {
      for (int ipa = 1; ipa <= 2; ipa++) {
	for (int ipamp = 1; ipamp <= 2; ipamp++) {
	  svs[l30 - 1][ilmp - 1][ipa - 1][ipamp - 1] = cc0;
	}
      } // ipa loop
    } // ilmp loop
  } // l30 loop
  for (int l58 = 1; l58 <= le; l58 ++) {
    int lpo = l58 + 1;
    int ltpo = l58 + lpo;
    int imm = l58 * lpo;
    for (int ilmp = 1; ilmp <= 3; ilmp++) {
      if ((l58 == 1 && ilmp == 1) || (l58 == le && ilmp == 3)) continue; // ilmp loop
	const int svs_start = 12 * (l58 - 1) + 4 * (ilmp - 1);
      int lmpml = ilmp - 2;
      int lmp = l58 + lmpml;
      int impmm = lmp * (lmp + 1);
	dcomplex uimtl = uim * (1.0 * lmpml);
      uimtl = uim * (1.0 * lmpml);
      if (lmpml == 0) uimtl = 1.0 + 0.0 * I;
      for (int im54 = 1; im54 <= ltpo; im54++) {
	int m = im54 - lpo;
@@ -247,7 +252,7 @@ void apcra(
		int lsmpml = ilsmp - 2;
		int lsmp = ls + lsmpml;
		int ismpmm = lsmp * (lsmp + 1);
		  dcomplex uimtls = -uim * (1.0 * lsmpml);
		uimtls = -uim * (1.0 * lsmpml);
		if (lsmpml == 0) uimtls = 1.0 + 0.0 * I;
		for (int ims = 1; ims <= lstpo; ims++) {
		  int ms = ims - lspo;
@@ -259,43 +264,35 @@ void apcra(
		    int ismp = ismpmm + msmp;
		    int ismpe = ismp + nlem;
		    double cgcs = cg1(lsmpml, mu, ls, ms);
		      dcomplex fc = (uimtl * uimtls) * (cgc * cgcs);
		      dcomplex ca11 = dconjg(am0m[is - 1][i - 1]);
		      dcomplex ca12 = dconjg(am0m[is - 1][ie - 1]);
		      dcomplex ca21 = dconjg(am0m[ise - 1][i - 1]);
		      dcomplex ca22 = dconjg(am0m[ise - 1][ie - 1]);
		      dcomplex a11 = am0m[ismp - 1][imp - 1];
		      dcomplex a12 = am0m[ismp - 1][impe - 1];
		      dcomplex a21 = am0m[ismpe - 1][imp - 1];
		      dcomplex a22 = am0m[ismpe - 1][impe - 1];
		    fc = (uimtl * uimtls) * (cgc * cgcs);
		    ca11 = dconjg(am0m[is - 1][i - 1]);
		    ca12 = dconjg(am0m[is - 1][ie - 1]);
		    ca21 = dconjg(am0m[ise - 1][i - 1]);
		    ca22 = dconjg(am0m[ise - 1][ie - 1]);
		    a11 = am0m[ismp - 1][imp - 1];
		    a12 = am0m[ismp - 1][impe - 1];
		    a21 = am0m[ismpe - 1][imp - 1];
		    a22 = am0m[ismpe - 1][impe - 1];
		    double z11 = zpv[ls - 1][ilsmp - 1][0][0];
		    double z12 = zpv[ls - 1][ilsmp - 1][0][1];
		    double z21 = zpv[ls - 1][ilsmp - 1][1][0];
		    double z22 = zpv[ls - 1][ilsmp - 1][1][1];
		      vec_svs[svs_start] += (
		        (
		           ca11 * a11 * z11 + ca11 * a21 * z12
			   + ca21 * a11 * z21 + ca21 * a21 * z22
		        ) * fc
		      );
		      vec_svs[svs_start + 1] += (
		        (
		          ca11 * a12 * z11 + ca11 * a22 * z12
			  + ca21 * a12 * z21 + ca21 * a22 * z22
		        ) * fc
		      );
		      vec_svs[svs_start + 2] += (
		        (
		          ca12 * a11 * z11 + ca12 * a21 * z12
			  + ca22 * a11 * z21 + ca22 * a21 * z22
		        ) * fc
		      );
		      vec_svs[svs_start + 3] += (
		        (
		          ca12 * a12 * z11 + ca12 * a22 * z12
			  + ca22 * a12 * z21 + ca22 * a22 * z22
		        ) * fc
	              );
		    svs[l58 - 1][ilmp - 1][0][0] += ((ca11 * a11 * z11
						      + ca11 * a21 * z12
						      + ca21 * a11 * z21
						      + ca21 * a21 * z22) * fc);
		    svs[l58 - 1][ilmp - 1][0][1] += ((ca11 * a12 * z11
						      + ca11 * a22 * z12
						      + ca21 * a12 * z21
						      + ca21 * a22 * z22) * fc);
		    svs[l58 - 1][ilmp - 1][1][0] += ((ca12 * a11 * z11
						      + ca12 * a21 * z12
						      + ca22 * a11 * z21
						      + ca22 * a21 * z22) * fc);
		    svs[l58 - 1][ilmp - 1][1][1] += ((ca12 * a12 * z11
						      + ca12 * a22 * z12
						      + ca22 * a12 * z21
						      + ca22 * a22 * z22) * fc);
		  } // ends ims loop
		} // ims loop
	      } // ilsmp loop
@@ -305,7 +302,8 @@ void apcra(
      } // im54 loop
    } // ilmp loop
  } // l58 loop
#pragma omp for reduction(+: sum1, sum2)
  sum1 = cc0;
  sum2 = cc0;
  for (int l68 = 1; l68 <= le; l68++) {
    //int lpo = l68 + 1;
    //int ltpo = l68 + lpo;
@@ -314,34 +312,33 @@ void apcra(
      if ((l68 == 1 && ilmp == 1) || (l68 == le && ilmp == 3)) continue; // ilmp loop
      if (inpol == 0) {
	sum1 += (
	    vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][1][1]
		 );
	sum2 += (
	    vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][1][1]
		 );
      } else { // label 62
	sum1 += (
	    vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1)] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 2] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][0][0] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][1][0] * svs[l68 - 1][ilmp - 1][1][1]
		 );
	sum2 += (
	    vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1)]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 1]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 1] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 2]
	    + vec_svw[12 * (l68 - 1) + 4 * (ilmp - 1) + 3] * vec_svs[12 * (l68 - 1) + 4 * (ilmp - 1) + 3]
		 svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][0][0]
		 + svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][0][1]
		 + svw[l68 - 1][ilmp - 1][0][1] * svs[l68 - 1][ilmp - 1][1][0]
		 + svw[l68 - 1][ilmp - 1][1][1] * svs[l68 - 1][ilmp - 1][1][1]
		 );
      } // label 66, ends ilmp loop
    } // ilmp loop
  } // l68 loop
  } // OMP parallel region
  const double half_pi = acos(0.0);
  double cofs = half_pi * 2.0 / sqk;
  gaprm[0][0] = 0.0;
@@ -366,14 +363,24 @@ void apcra(
    gappm[2][0] = cc0;
    gappm[2][1] = cc0;
  }
  
  // Clean memory
  delete[] vec_svw;
  delete[] vec_svs;
  for (int i = le - 1; i > -1; i--) {
    for (int j = 2; j > -1; j--) {
      for (int k = 1; k > -1; k--) {
	delete[] svw[i][j][k];
	delete[] svs[i][j][k];
      }
      delete[] svw[i][j];
      delete[] svs[i][j];
    }
    delete[] svw[i];
    delete[] svs[i];
  }
  delete[] svw;
  delete[] svs;
}

#ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
#endif
dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int istep) {
  dcomplex result = z;
  if (nj > 0) {
@@ -384,9 +391,6 @@ dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int
  }
  return result;
}
#ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
#endif

// #ifdef USE_TARGET_OFFLOAD
// #pragma omp begin declare target device_type(any)
@@ -1134,7 +1138,7 @@ void hjv(
  delete[] rfn;
}

int lucin(dcomplex *vec_am, const np_int nddmst, np_int n) {
void lucin(dcomplex **am, const np_int nddmst, np_int n, int &ier) {
  /* NDDMST  FIRST DIMENSION OF AM AS DECLARED IN DIMENSION
   *         STATEMENT.
   * N       NUMBER OF ROWS IN AM.
@@ -1142,24 +1146,22 @@ int lucin(dcomplex *vec_am, const np_int nddmst, np_int n) {
   */
  double *v = new double[nddmst];
  double *vi = new double[nddmst]();
  dcomplex *vec_am = am[0];
  const dcomplex cc0 = 0.0 + 0.0 * I;
  int ier = 0;
  ier = 0;
  int nminus = n - 1;
  const np_int nn = n * n;
#pragma omp parallel
  {
#pragma omp for reduction(+: vi[0:nddmst])
    for (np_int ij = 0; ij < nn; ij++) {
      np_int i = ij / n;
      np_int j = ij % n;
      vi[i] += (
        real(vec_am[i * n + j]) * real(vec_am[i * n + j])
        + imag(vec_am[i * n + j]) * imag(vec_am[i * n + j])
#pragma omp parallel for
  for (int64_t i = 1; i <= n; i++) {
#pragma omp parallel for reduction(+: vi[i - 1])
    for (int64_t j = 1; j <= n; j++) {
      vi[i - 1] += (
	real(vec_am[(i - 1) * n + j - 1]) * real(vec_am[(i - 1) * n + j - 1])
	+ imag(vec_am[(i - 1) * n + j - 1]) * imag(vec_am[(i - 1) * n + j - 1])
      );
    }
#pragma omp for
    for (np_int i = 0; i < n; i++) v[i] = 1.0 / vi[i];
  } // OMP parallel region
    } // j1319 loop
    v[i - 1] = 1.0 / vi[i - 1];
  } // i1309 loop
  delete[] vi;
  // 2. REPLACE AM BY TRIANGULAR MATRICES (L,U) WHERE AM=L*U.
  //    REPLACE L(I,I) BY 1/L(I,I), READY FOR SECTION 4.
  //    (ROW INTERCHANGES TAKE PLACE, AND THE INDICES OF THE PIVOTAL ROWS
@@ -1193,11 +1195,13 @@ int lucin(dcomplex *vec_am, const np_int nddmst, np_int n) {
    v[k - 1] = 1.0 * l;
    if (psqmax == 0.0) {
      ier = 1;
      // return ier;
      delete[] v;
      return;
    }
    dcomplex ctemp = 1.0 / vec_am[(k - 1) * n + k - 1];
    vec_am[(k - 1) * n + k - 1] = ctemp;
    if (kplus <= n) {
#pragma omp parallel for
      for (int64_t j = kplus; j <= n; j++) {
	dcomplex cfun = cdtp(-vec_am[(k - 1) * n + j - 1], vec_am, k, 1, j, kminus, n);
	vec_am[(k - 1) * n + j - 1] = -ctemp * cfun;
@@ -1242,9 +1246,7 @@ int lucin(dcomplex *vec_am, const np_int nddmst, np_int n) {
      } // i4319 loop
    }
  } // l4309 loop
  delete[] vi;
  delete[] v;
  return ier;
}

void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **cext) {