Commit dacf2315 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Use local rac3j vector in ghit()

parent 56aa91db
Loading
Loading
Loading
Loading
+0 −20
Original line number Diff line number Diff line
@@ -127,26 +127,6 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1);
 */
void crsm1(double vk, double exri, ParticleDescriptor *c1);

/*! \brief Compute the transfer vector from N2 to N1.
 *
 * This function computes the transfer vector going from N2 to N1, using either
 * Hankel, Bessel or Bessel from origin functions.
 *
 * \param ihi: `int`
 * \param ipamo: `int`
 * \param nbl: `int`
 * \param l1: `int`
 * \param m1: `int`
 * \param l2: `int`
 * \param m2: `int`
 * \param c1: `ParticleDescriptor *`
 * \param rac3j: `dcomplex *`
 */
dcomplex ghit_d(
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      ParticleDescriptor *c1, double *rac3j
);

/*! \brief Compute the transfer vector from N2 to N1.
 *
 * This function computes the transfer vector going from N2 to N1, using either
+27 −244
Original line number Diff line number Diff line
@@ -19,6 +19,8 @@
 * \brief C++ implementation of CLUSTER subroutines.
 */

#include <cstring>

#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif
@@ -53,6 +55,8 @@

using namespace std;

const double four_pi = 8.0 * acos(0.0);

void apc(
  double ****zpv, int le, dcomplex **am0m, dcomplex **w,
  double sqk, double **gapr, dcomplex **gapp
@@ -596,15 +600,12 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1) {
    int j2e = (is_valid_iter) ? in1 + ilm2e : 0;
    // End of index 2 magnetic quantum numbers
    dcomplex cgh, cgk;
    #pragma omp critical
    {
    cgh = (is_valid_iter) ?
      ghit(0, 0, nbl, l1, m1, l2, m2, c1) :
      cc0;
    cgk = (is_valid_iter) ?
      ghit(0, 1, nbl, l1, m1, l2, m2, c1) :
      cc0;
    }
    if (is_valid_iter) {
      am[i1 - 1][i2 - 1] = cgh;
      am[i1 - 1][i2e - 1] = cgk;
@@ -770,226 +771,6 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) {
  delete[] svs;
}

// #ifdef USE_TARGET_OFFLOAD
// #pragma omp begin declare target device_type(any)
// #endif
dcomplex ghit_d(
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      ParticleDescriptor *c1, double *rac3j
) {
  /* NBL identifies transfer vector going from N2 to N1;
   * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin;
   * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex uim = 0.0 + 1.0 * I;
  dcomplex csum = cc0, cfun = cc0;
  dcomplex result = cc0;

  if (ihi == 2) {
    if (c1->rxx[nbl - 1] == 0.0 && c1->ryy[nbl - 1] == 0.0 && c1->rzz[nbl - 1] == 0.0) {
      if (ipamo == 0) {
	if (l1 == l2 && m1 == m2) result = 1.0 + 0.0 * I;
      }
      return result;
    }
  }
  // label 10
  int l1mp = l1 - ipamo;
  int l1po = l1 + 1;
  int m1mm2 = m1 - m2;
  int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2;
  int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1;
  int lmaxpo = l2 + l1mp + 1;
  int i3j0in = c1->ind3j[l1mp][l2 - 1];
  int ilin = -1;
  if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0;
  int isn = 1;
  if (m1 % 2 != 0) isn *= -1;
  if (lminpo % 2 == 0) {
    isn *= -1;
    if (l2 > l1mp) isn *= -1;
  }
  // label 12
  int nblmo = nbl - 1;
  if (ihi != 2) {
    int nbhj = nblmo * c1->litpo;
    int nby = nblmo * c1->litpos;
    if (ihi != 1) {
      for (int jm24 = 1; jm24 <= 3; jm24++) {
	csum = cc0;
	int mu = jm24 - 2;
	int mupm1 = mu + m1;
	int mupm2 = mu + m2;
	if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) {
	  int jsn = -isn;
	  if (mu == 0) jsn = isn;
	  double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2);
	  int i3j0 = i3j0in;
	  if (mupm1 == 0 && mupm2 == 0) {
	    int lt14 = lminpo;
	    while (lt14 <= lmaxpo) {
	      i3j0++;
	      int l3 = lt14 - 1;
	      int ny = l3 * l3 + lt14;
	      double aors = 1.0 * (l3 + lt14);
	      double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
	      cfun = (c1->vh[nbhj + lt14 - 1] * c1->vyhj[nby + ny - 1]) * f3j;
	      csum += cfun;
	      jsn *= -1;
	      lt14 += 2;
	    }
	    // goes to 22
	  } else { // label 16
	    r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j);
	    int il = ilin;
	    int lt20 = lminpo;
	    while (lt20 <= lmaxpo) {
	      i3j0++;
	      if (m1mm2m <= lt20) {
		il += 2;
		int l3 = lt20 - 1;
		int ny = l3 * l3  + lt20 + m1mm2;
		double aors = 1.0 * (l3 + lt20);
		double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
		cfun = (c1->vh[nbhj + lt20 - 1] * c1->vyhj[nby + ny - 1]) * f3j;
		csum += cfun;
	      }
	      // label 20
	      jsn *= -1;
	      lt20 += 2;
	    }
	  }
	  // label 22
	  csum *= cr;
	  result += csum;
	}
	// Otherwise there is nothing to add
      } // jm24 loop. Should go to 70
    } else { // label 30, IHI == 1
      for (int jm44 = 1; jm44 <= 3; jm44++) {
	csum = cc0;
	int mu = jm44 - 2;
	int mupm1 = mu + m1;
	int mupm2 = mu + m2;
	if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) {
	  int jsn = - isn;
	  if (mu == 0) jsn = isn;
	  double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2);
	  int i3j0 = i3j0in;
	  if (mupm1 == 0 && mupm2 == 0) {
	    int lt34 = lminpo;
	    while (lt34 <= lmaxpo) {
	      i3j0++;
	      int l3 = lt34 - 1;
	      int ny = l3 * l3 + lt34;
	      double aors = 1.0 * (l3 + lt34);
	      double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
	      cfun = (c1->vh[nbhj + lt34 - 1] * c1->vyhj[nby + ny - 1]) * f3j;
	      csum += cfun;
	      jsn *= -1;
	      lt34 += 2;
	    }
	    // goes to 42
	  } else { // label 36
	    r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j);
	    int il = ilin;
	    int lt40 = lminpo;
	    while (lt40 <= lmaxpo) {
	      i3j0++;
	      if (m1mm2m <= lt40) {
		il += 2;
		int l3 = lt40 - 1;
		int ny = l3 * l3  + lt40 + m1mm2;
		double aors = 1.0 * (l3 + lt40);
		double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
		cfun = (c1->vh[nbhj + lt40 - 1] * c1->vyhj[nby + ny - 1]) * f3j;
		csum += cfun;
	      }
	      // label 40
	      jsn *= -1;
	      lt40 += 2;
	    }
	  }
	  // label 42
	  csum *= cr;
	  result += csum;
	}
	// Otherwise there is nothing to add
      } // jm44 loop. Should go to 70
    }
    // goes to 70
  } else { // label 50, IHI == 2
    int nbhj = nblmo * c1->lmtpo;
    int nby = nblmo * c1->lmtpos;
    for (int jm64 = 1; jm64 <= 3; jm64++) {
      csum = cc0;
      int mu = jm64 - 2;
      int mupm1 = mu + m1;
      int mupm2 = mu + m2;
      if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) {
	int jsn = -isn;
	if (mu == 0) jsn = isn;
	double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2);
	int i3j0 = i3j0in;
	if (mupm1 == 0 && mupm2 == 0) {
	  int lt54 = lminpo;
	  while (lt54 <= lmaxpo) {
	    i3j0++;
	    int l3 = lt54 - 1;
	    int ny = l3 * l3 + lt54;
	    double aors = 1.0 * (l3 + lt54);
	    double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
	    cfun = (c1->vj0[nbhj + lt54 - 1] * c1->vyj0[nby + ny - 1]) * f3j;
	    csum += cfun;
	    jsn *= -1;
	    lt54 += 2;
	  }
	  // goes to 62
	} else { // label 56
	  r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j);
	  int il = ilin;
	  int lt60 = lminpo;
	  while (lt60 <= lmaxpo) {
	    i3j0++;
	    if (m1mm2m <= lt60) {
	      il += 2;
	      int l3 = lt60 - 1;
	      int ny = l3 * l3  + lt60 + m1mm2;
	      double aors = 1.0 * (l3 + lt60);
	      double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
	      cfun = (c1->vj0[nbhj + lt60 - 1] * c1->vyj0[nby + ny - 1]) * f3j;
	      csum += cfun;
	    }
	    // label 60
	    jsn *= -1;
	    lt60 += 2;
	  }
	}
	// label 62
	csum *= cr;
	result += csum;
      }
      // Otherwise there is nothing to add
    } // jm64 loop. Should go to 70
  }
  // label 70
  const double four_pi = acos(0.0) * 8.0;
  if (ipamo != 1) {
    double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1));
    result *= cr;
  } else {
    double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po);
    result *= (cr * uim);
  }
  return result;
}
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp end declare target
// #endif

// #ifdef USE_TARGET_OFFLOAD
// #pragma omp begin declare target device_type(any)
// #endif
dcomplex ghit(
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      ParticleDescriptor *c1
@@ -1019,6 +800,9 @@ dcomplex ghit(
  int lmaxpo = l2 + l1mp + 1;
  int i3j0in = c1->ind3j[l1mp][l2 - 1];
  int ilin = -1;
  const int lmtpo = c1->lmtpo;
  double *rac3j = new double[lmtpo];
  memcpy(rac3j, c1->rac3j, lmtpo * sizeof(double));
  if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0;
  int isn = 1;
  if (m1 % 2 != 0) isn *= -1;
@@ -1057,7 +841,7 @@ dcomplex ghit(
	    }
	    // goes to 22
	  } else { // label 16
	    r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j);
	    r3jjr(l1mp, l2, -mupm1, mupm2, rac3j);
	    int il = ilin;
	    int lt20 = lminpo;
	    while (lt20 <= lmaxpo) {
@@ -1067,7 +851,7 @@ dcomplex ghit(
		int l3 = lt20 - 1;
		int ny = l3 * l3  + lt20 + m1mm2;
		double aors = 1.0 * (l3 + lt20);
		double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
		double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
		cfun = (c1->vh[nbhj + lt20 - 1] * c1->vyhj[nby + ny - 1]) * f3j;
		csum += cfun;
	      }
@@ -1108,7 +892,7 @@ dcomplex ghit(
	    }
	    // goes to 42
	  } else { // label 36
	    r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j);
	    r3jjr(l1mp, l2, -mupm1, mupm2, rac3j);
	    int il = ilin;
	    int lt40 = lminpo;
	    while (lt40 <= lmaxpo) {
@@ -1118,7 +902,7 @@ dcomplex ghit(
		int l3 = lt40 - 1;
		int ny = l3 * l3  + lt40 + m1mm2;
		double aors = 1.0 * (l3 + lt40);
		double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
		double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
		cfun = (c1->vj * c1->vyhj[nby + ny - 1]) * f3j;
		csum += cfun;
	      }
@@ -1163,7 +947,7 @@ dcomplex ghit(
	  }
	  // goes to 62
	} else { // label 56
	  r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j);
	  r3jjr(l1mp, l2, -mupm1, mupm2, rac3j);
	  int il = ilin;
	  int lt60 = lminpo;
	  while (lt60 <= lmaxpo) {
@@ -1173,7 +957,7 @@ dcomplex ghit(
	      int l3 = lt60 - 1;
	      int ny = l3 * l3  + lt60 + m1mm2;
	      double aors = 1.0 * (l3 + lt60);
	      double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
	      double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn;
	      cfun = (c1->vj0[nbhj + lt60 - 1] * c1->vyj0[nby + ny - 1]) * f3j;
	      csum += cfun;
	    }
@@ -1190,7 +974,6 @@ dcomplex ghit(
    } // jm64 loop. Should go to 70
  }
  // label 70
  const double four_pi = acos(0.0) * 8.0;
  if (ipamo != 1) {
    double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1));
    result *= cr;
@@ -1198,11 +981,9 @@ dcomplex ghit(
    double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po);
    result *= (cr * uim);
  }
  delete[] rac3j;
  return result;
}
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp end declare target
// #endif

void hjv(
	 double exri, double vk, int &jer, int &lcalc, dcomplex &arg,
@@ -2604,7 +2385,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
// #else
// #pragma omp parallel for simd collapse(3)
// #endif
#pragma omp parallel for simd collapse(3)
// #pragma omp parallel for simd collapse(3)
  for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
    for (int k2 = 1; k2<=k2max; k2++) {
      for (int k3 = 1; k3<=k3max; k3++) {
@@ -2635,10 +2416,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	int i3 = l3 * l3 + im3 - 1;
	int m3 = -l3 - 1 + im3;
	int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
	double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
	gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
	gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
	free(rac3j_local);
	// double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
	// gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
	// gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
	// free(rac3j_local);
	gis_v[vecindex] = ghit(2, 0, n2, l2, m2, l3, m3, c1);
	gls_v[vecindex] = ghit(2, 1, n2, l2, m2, l3, m3, c1);
      } // close k3 loop, former l3 + im3 loops
    } // close k2 loop, former l2 + im2 loops
  } // close n2 loop