Loading src/include/clu_subs.h +0 −20 Original line number Diff line number Diff line Loading @@ -127,26 +127,6 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1); */ void crsm1(double vk, double exri, ParticleDescriptor *c1); /*! \brief Compute the transfer vector from N2 to N1. * * This function computes the transfer vector going from N2 to N1, using either * Hankel, Bessel or Bessel from origin functions. * * \param ihi: `int` * \param ipamo: `int` * \param nbl: `int` * \param l1: `int` * \param m1: `int` * \param l2: `int` * \param m2: `int` * \param c1: `ParticleDescriptor *` * \param rac3j: `dcomplex *` */ dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j ); /*! \brief Compute the transfer vector from N2 to N1. * * This function computes the transfer vector going from N2 to N1, using either Loading src/libnptm/clu_subs.cpp +27 −244 Original line number Diff line number Diff line Loading @@ -19,6 +19,8 @@ * \brief C++ implementation of CLUSTER subroutines. */ #include <cstring> #ifndef INCLUDE_TYPES_H_ #include "../include/types.h" #endif Loading Loading @@ -53,6 +55,8 @@ using namespace std; const double four_pi = 8.0 * acos(0.0); void apc( double ****zpv, int le, dcomplex **am0m, dcomplex **w, double sqk, double **gapr, dcomplex **gapp Loading Loading @@ -596,15 +600,12 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1) { int j2e = (is_valid_iter) ? in1 + ilm2e : 0; // End of index 2 magnetic quantum numbers dcomplex cgh, cgk; #pragma omp critical { cgh = (is_valid_iter) ? ghit(0, 0, nbl, l1, m1, l2, m2, c1) : cc0; cgk = (is_valid_iter) ? ghit(0, 1, nbl, l1, m1, l2, m2, c1) : cc0; } if (is_valid_iter) { am[i1 - 1][i2 - 1] = cgh; am[i1 - 1][i2e - 1] = cgk; Loading Loading @@ -770,226 +771,6 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; } // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j ) { /* NBL identifies transfer vector going from N2 to N1; * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin; * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */ const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; dcomplex csum = cc0, cfun = cc0; dcomplex result = cc0; if (ihi == 2) { if (c1->rxx[nbl - 1] == 0.0 && c1->ryy[nbl - 1] == 0.0 && c1->rzz[nbl - 1] == 0.0) { if (ipamo == 0) { if (l1 == l2 && m1 == m2) result = 1.0 + 0.0 * I; } return result; } } // label 10 int l1mp = l1 - ipamo; int l1po = l1 + 1; int m1mm2 = m1 - m2; int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2; int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1; int lmaxpo = l2 + l1mp + 1; int i3j0in = c1->ind3j[l1mp][l2 - 1]; int ilin = -1; if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0; int isn = 1; if (m1 % 2 != 0) isn *= -1; if (lminpo % 2 == 0) { isn *= -1; if (l2 > l1mp) isn *= -1; } // label 12 int nblmo = nbl - 1; if (ihi != 2) { int nbhj = nblmo * c1->litpo; int nby = nblmo * c1->litpos; if (ihi != 1) { for (int jm24 = 1; jm24 <= 3; jm24++) { csum = cc0; int mu = jm24 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt14 = lminpo; while (lt14 <= lmaxpo) { i3j0++; int l3 = lt14 - 1; int ny = l3 * l3 + lt14; double aors = 1.0 * (l3 + lt14); double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt14 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; jsn *= -1; lt14 += 2; } // goes to 22 } else { // label 16 r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt20 = lminpo; while (lt20 <= lmaxpo) { i3j0++; if (m1mm2m <= lt20) { il += 2; int l3 = lt20 - 1; int ny = l3 * l3 + lt20 + m1mm2; double aors = 1.0 * (l3 + lt20); double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt20 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } // label 20 jsn *= -1; lt20 += 2; } } // label 22 csum *= cr; result += csum; } // Otherwise there is nothing to add } // jm24 loop. Should go to 70 } else { // label 30, IHI == 1 for (int jm44 = 1; jm44 <= 3; jm44++) { csum = cc0; int mu = jm44 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = - isn; if (mu == 0) jsn = isn; double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt34 = lminpo; while (lt34 <= lmaxpo) { i3j0++; int l3 = lt34 - 1; int ny = l3 * l3 + lt34; double aors = 1.0 * (l3 + lt34); double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt34 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; jsn *= -1; lt34 += 2; } // goes to 42 } else { // label 36 r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt40 = lminpo; while (lt40 <= lmaxpo) { i3j0++; if (m1mm2m <= lt40) { il += 2; int l3 = lt40 - 1; int ny = l3 * l3 + lt40 + m1mm2; double aors = 1.0 * (l3 + lt40); double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt40 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } // label 40 jsn *= -1; lt40 += 2; } } // label 42 csum *= cr; result += csum; } // Otherwise there is nothing to add } // jm44 loop. Should go to 70 } // goes to 70 } else { // label 50, IHI == 2 int nbhj = nblmo * c1->lmtpo; int nby = nblmo * c1->lmtpos; for (int jm64 = 1; jm64 <= 3; jm64++) { csum = cc0; int mu = jm64 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt54 = lminpo; while (lt54 <= lmaxpo) { i3j0++; int l3 = lt54 - 1; int ny = l3 * l3 + lt54; double aors = 1.0 * (l3 + lt54); double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj0[nbhj + lt54 - 1] * c1->vyj0[nby + ny - 1]) * f3j; csum += cfun; jsn *= -1; lt54 += 2; } // goes to 62 } else { // label 56 r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt60 = lminpo; while (lt60 <= lmaxpo) { i3j0++; if (m1mm2m <= lt60) { il += 2; int l3 = lt60 - 1; int ny = l3 * l3 + lt60 + m1mm2; double aors = 1.0 * (l3 + lt60); double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj0[nbhj + lt60 - 1] * c1->vyj0[nby + ny - 1]) * f3j; csum += cfun; } // label 60 jsn *= -1; lt60 += 2; } } // label 62 csum *= cr; result += csum; } // Otherwise there is nothing to add } // jm64 loop. Should go to 70 } // label 70 const double four_pi = acos(0.0) * 8.0; if (ipamo != 1) { double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1)); result *= cr; } else { double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po); result *= (cr * uim); } return result; } // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 Loading Loading @@ -1019,6 +800,9 @@ dcomplex ghit( int lmaxpo = l2 + l1mp + 1; int i3j0in = c1->ind3j[l1mp][l2 - 1]; int ilin = -1; const int lmtpo = c1->lmtpo; double *rac3j = new double[lmtpo]; memcpy(rac3j, c1->rac3j, lmtpo * sizeof(double)); if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0; int isn = 1; if (m1 % 2 != 0) isn *= -1; Loading Loading @@ -1057,7 +841,7 @@ dcomplex ghit( } // goes to 22 } else { // label 16 r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j); r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt20 = lminpo; while (lt20 <= lmaxpo) { Loading @@ -1067,7 +851,7 @@ dcomplex ghit( int l3 = lt20 - 1; int ny = l3 * l3 + lt20 + m1mm2; double aors = 1.0 * (l3 + lt20); double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt20 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } Loading Loading @@ -1108,7 +892,7 @@ dcomplex ghit( } // goes to 42 } else { // label 36 r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j); r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt40 = lminpo; while (lt40 <= lmaxpo) { Loading @@ -1118,7 +902,7 @@ dcomplex ghit( int l3 = lt40 - 1; int ny = l3 * l3 + lt40 + m1mm2; double aors = 1.0 * (l3 + lt40); double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } Loading Loading @@ -1163,7 +947,7 @@ dcomplex ghit( } // goes to 62 } else { // label 56 r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j); r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt60 = lminpo; while (lt60 <= lmaxpo) { Loading @@ -1173,7 +957,7 @@ dcomplex ghit( int l3 = lt60 - 1; int ny = l3 * l3 + lt60 + m1mm2; double aors = 1.0 * (l3 + lt60); double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj0[nbhj + lt60 - 1] * c1->vyj0[nby + ny - 1]) * f3j; csum += cfun; } Loading @@ -1190,7 +974,6 @@ dcomplex ghit( } // jm64 loop. Should go to 70 } // label 70 const double four_pi = acos(0.0) * 8.0; if (ipamo != 1) { double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1)); result *= cr; Loading @@ -1198,11 +981,9 @@ dcomplex ghit( double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po); result *= (cr * uim); } delete[] rac3j; return result; } // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -2604,7 +2385,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) // #pragma omp parallel for simd collapse(3) for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2635,10 +2416,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); free(rac3j_local); // double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); // gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); // gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); // free(rac3j_local); gis_v[vecindex] = ghit(2, 0, n2, l2, m2, l3, m3, c1); gls_v[vecindex] = ghit(2, 1, n2, l2, m2, l3, m3, c1); } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop Loading Loading
src/include/clu_subs.h +0 −20 Original line number Diff line number Diff line Loading @@ -127,26 +127,6 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1); */ void crsm1(double vk, double exri, ParticleDescriptor *c1); /*! \brief Compute the transfer vector from N2 to N1. * * This function computes the transfer vector going from N2 to N1, using either * Hankel, Bessel or Bessel from origin functions. * * \param ihi: `int` * \param ipamo: `int` * \param nbl: `int` * \param l1: `int` * \param m1: `int` * \param l2: `int` * \param m2: `int` * \param c1: `ParticleDescriptor *` * \param rac3j: `dcomplex *` */ dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j ); /*! \brief Compute the transfer vector from N2 to N1. * * This function computes the transfer vector going from N2 to N1, using either Loading
src/libnptm/clu_subs.cpp +27 −244 Original line number Diff line number Diff line Loading @@ -19,6 +19,8 @@ * \brief C++ implementation of CLUSTER subroutines. */ #include <cstring> #ifndef INCLUDE_TYPES_H_ #include "../include/types.h" #endif Loading Loading @@ -53,6 +55,8 @@ using namespace std; const double four_pi = 8.0 * acos(0.0); void apc( double ****zpv, int le, dcomplex **am0m, dcomplex **w, double sqk, double **gapr, dcomplex **gapp Loading Loading @@ -596,15 +600,12 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1) { int j2e = (is_valid_iter) ? in1 + ilm2e : 0; // End of index 2 magnetic quantum numbers dcomplex cgh, cgk; #pragma omp critical { cgh = (is_valid_iter) ? ghit(0, 0, nbl, l1, m1, l2, m2, c1) : cc0; cgk = (is_valid_iter) ? ghit(0, 1, nbl, l1, m1, l2, m2, c1) : cc0; } if (is_valid_iter) { am[i1 - 1][i2 - 1] = cgh; am[i1 - 1][i2e - 1] = cgk; Loading Loading @@ -770,226 +771,6 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; } // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j ) { /* NBL identifies transfer vector going from N2 to N1; * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin; * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */ const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; dcomplex csum = cc0, cfun = cc0; dcomplex result = cc0; if (ihi == 2) { if (c1->rxx[nbl - 1] == 0.0 && c1->ryy[nbl - 1] == 0.0 && c1->rzz[nbl - 1] == 0.0) { if (ipamo == 0) { if (l1 == l2 && m1 == m2) result = 1.0 + 0.0 * I; } return result; } } // label 10 int l1mp = l1 - ipamo; int l1po = l1 + 1; int m1mm2 = m1 - m2; int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2; int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1; int lmaxpo = l2 + l1mp + 1; int i3j0in = c1->ind3j[l1mp][l2 - 1]; int ilin = -1; if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0; int isn = 1; if (m1 % 2 != 0) isn *= -1; if (lminpo % 2 == 0) { isn *= -1; if (l2 > l1mp) isn *= -1; } // label 12 int nblmo = nbl - 1; if (ihi != 2) { int nbhj = nblmo * c1->litpo; int nby = nblmo * c1->litpos; if (ihi != 1) { for (int jm24 = 1; jm24 <= 3; jm24++) { csum = cc0; int mu = jm24 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt14 = lminpo; while (lt14 <= lmaxpo) { i3j0++; int l3 = lt14 - 1; int ny = l3 * l3 + lt14; double aors = 1.0 * (l3 + lt14); double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt14 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; jsn *= -1; lt14 += 2; } // goes to 22 } else { // label 16 r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt20 = lminpo; while (lt20 <= lmaxpo) { i3j0++; if (m1mm2m <= lt20) { il += 2; int l3 = lt20 - 1; int ny = l3 * l3 + lt20 + m1mm2; double aors = 1.0 * (l3 + lt20); double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt20 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } // label 20 jsn *= -1; lt20 += 2; } } // label 22 csum *= cr; result += csum; } // Otherwise there is nothing to add } // jm24 loop. Should go to 70 } else { // label 30, IHI == 1 for (int jm44 = 1; jm44 <= 3; jm44++) { csum = cc0; int mu = jm44 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = - isn; if (mu == 0) jsn = isn; double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt34 = lminpo; while (lt34 <= lmaxpo) { i3j0++; int l3 = lt34 - 1; int ny = l3 * l3 + lt34; double aors = 1.0 * (l3 + lt34); double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt34 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; jsn *= -1; lt34 += 2; } // goes to 42 } else { // label 36 r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt40 = lminpo; while (lt40 <= lmaxpo) { i3j0++; if (m1mm2m <= lt40) { il += 2; int l3 = lt40 - 1; int ny = l3 * l3 + lt40 + m1mm2; double aors = 1.0 * (l3 + lt40); double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt40 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } // label 40 jsn *= -1; lt40 += 2; } } // label 42 csum *= cr; result += csum; } // Otherwise there is nothing to add } // jm44 loop. Should go to 70 } // goes to 70 } else { // label 50, IHI == 2 int nbhj = nblmo * c1->lmtpo; int nby = nblmo * c1->lmtpos; for (int jm64 = 1; jm64 <= 3; jm64++) { csum = cc0; int mu = jm64 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt54 = lminpo; while (lt54 <= lmaxpo) { i3j0++; int l3 = lt54 - 1; int ny = l3 * l3 + lt54; double aors = 1.0 * (l3 + lt54); double f3j = (c1->v3j0[i3j0 - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj0[nbhj + lt54 - 1] * c1->vyj0[nby + ny - 1]) * f3j; csum += cfun; jsn *= -1; lt54 += 2; } // goes to 62 } else { // label 56 r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt60 = lminpo; while (lt60 <= lmaxpo) { i3j0++; if (m1mm2m <= lt60) { il += 2; int l3 = lt60 - 1; int ny = l3 * l3 + lt60 + m1mm2; double aors = 1.0 * (l3 + lt60); double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj0[nbhj + lt60 - 1] * c1->vyj0[nby + ny - 1]) * f3j; csum += cfun; } // label 60 jsn *= -1; lt60 += 2; } } // label 62 csum *= cr; result += csum; } // Otherwise there is nothing to add } // jm64 loop. Should go to 70 } // label 70 const double four_pi = acos(0.0) * 8.0; if (ipamo != 1) { double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1)); result *= cr; } else { double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po); result *= (cr * uim); } return result; } // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 Loading Loading @@ -1019,6 +800,9 @@ dcomplex ghit( int lmaxpo = l2 + l1mp + 1; int i3j0in = c1->ind3j[l1mp][l2 - 1]; int ilin = -1; const int lmtpo = c1->lmtpo; double *rac3j = new double[lmtpo]; memcpy(rac3j, c1->rac3j, lmtpo * sizeof(double)); if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0; int isn = 1; if (m1 % 2 != 0) isn *= -1; Loading Loading @@ -1057,7 +841,7 @@ dcomplex ghit( } // goes to 22 } else { // label 16 r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j); r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt20 = lminpo; while (lt20 <= lmaxpo) { Loading @@ -1067,7 +851,7 @@ dcomplex ghit( int l3 = lt20 - 1; int ny = l3 * l3 + lt20 + m1mm2; double aors = 1.0 * (l3 + lt20); double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vh[nbhj + lt20 - 1] * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } Loading Loading @@ -1108,7 +892,7 @@ dcomplex ghit( } // goes to 42 } else { // label 36 r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j); r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt40 = lminpo; while (lt40 <= lmaxpo) { Loading @@ -1118,7 +902,7 @@ dcomplex ghit( int l3 = lt40 - 1; int ny = l3 * l3 + lt40 + m1mm2; double aors = 1.0 * (l3 + lt40); double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj * c1->vyhj[nby + ny - 1]) * f3j; csum += cfun; } Loading Loading @@ -1163,7 +947,7 @@ dcomplex ghit( } // goes to 62 } else { // label 56 r3jjr(l1mp, l2, -mupm1, mupm2, c1->rac3j); r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt60 = lminpo; while (lt60 <= lmaxpo) { Loading @@ -1173,7 +957,7 @@ dcomplex ghit( int l3 = lt60 - 1; int ny = l3 * l3 + lt60 + m1mm2; double aors = 1.0 * (l3 + lt60); double f3j = (c1->rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; double f3j = (rac3j[il - 1] * c1->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; cfun = (c1->vj0[nbhj + lt60 - 1] * c1->vyj0[nby + ny - 1]) * f3j; csum += cfun; } Loading @@ -1190,7 +974,6 @@ dcomplex ghit( } // jm64 loop. Should go to 70 } // label 70 const double four_pi = acos(0.0) * 8.0; if (ipamo != 1) { double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1)); result *= cr; Loading @@ -1198,11 +981,9 @@ dcomplex ghit( double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po); result *= (cr * uim); } delete[] rac3j; return result; } // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -2604,7 +2385,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) // #pragma omp parallel for simd collapse(3) for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2635,10 +2416,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); free(rac3j_local); // double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); // gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); // gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); // free(rac3j_local); gis_v[vecindex] = ghit(2, 0, n2, l2, m2, l3, m3, c1); gls_v[vecindex] = ghit(2, 1, n2, l2, m2, l3, m3, c1); } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop Loading