Loading src/libnptm/magma_calls.cpp +447 −7 Original line number Diff line number Diff line Loading @@ -63,14 +63,69 @@ using namespace std; #define FOUR_PI 12.566370614359172 #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) inline magmaDoubleComplex magma_zadd(const magmaDoubleComplex& a, const magmaDoubleComplex& b) { return MAGMA_Z_MAKE(MAGMA_Z_REAL(a)+MAGMA_Z_REAL(b),MAGMA_Z_IMAG(a)+MAGMA_Z_IMAG(b)); } #pragma omp end declare target #pragma omp begin declare target device_type(any) inline magmaDoubleComplex magma_zprod(const magmaDoubleComplex& a, const magmaDoubleComplex& b) { return MAGMA_Z_MAKE( MAGMA_Z_REAL(a)*MAGMA_Z_REAL(b)-MAGMA_Z_IMAG(a)*MAGMA_Z_IMAG(b), MAGMA_Z_REAL(a)*MAGMA_Z_IMAG(b)+MAGMA_Z_IMAG(a)*MAGMA_Z_REAL(b) ); } #pragma omp end declare target #pragma omp begin declare target device_type(any) inline magmaDoubleComplex magma_zscale(const magmaDoubleComplex& a, const double& b) { return MAGMA_Z_MAKE(MAGMA_Z_REAL(a)*b,MAGMA_Z_IMAG(a)*b); } #pragma omp end declare target #pragma omp begin declare target device_type(any) double magma_cgev(int ipamo, int mu, int l, int m) { double xd, xn; if (ipamo == 0) { if (mu != 0) { xd = 2.0 * l * (l + 1); xn = (mu < 0) ? 1.0 * (l + m) * (l - m + 1) : 1.0 * (l - m) * (l + m + 1); double result = sqrt(xn / xd); return (mu <= 0) ? result : -result; } else { // mu == 0 return -1.0 * m / sqrt(1.0 * (l + 1) * l); } return 0.0; } else { // ipamo != 0 xd = 2.0 * l * (l * 2 - 1); xn = (mu == 0) ? 2.0 * (l - m) * (l + m) : ((mu < 0) ? 1.0 * (l - 1 + m) * (l + m) : 1.0 * (l - 1 - m) * (l - m)); return sqrt(xn / xd); } } #pragma omp end declare target magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int device_id) { const magmaDoubleComplex cz0 = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex cz1 = MAGMA_Z_MAKE(1.0, 0.0); const int nsph = c1->nsph; const int nsphmo = c1->nsph - 1; const int nsphmo = nsph - 1; const int ncou = nsph * nsphmo; const int li = c1->li; const int litpo = li + li + 1; const int litpos = litpo * litpo; const int le = c1->le; const int lmtpo = c1->lmtpo; const int lmtpos = c1->lmtpos; const int rsize = li * nsph; const int ind3j_size = (c1->lm + 1) * c1->lm; const int nv3j = c1->nv3j; const magma_int_t max_litpo = 2 * li + 1; const magma_int_t nlim = li * (li + 2); const magma_int_t ndi = nsph * nlim; Loading @@ -80,6 +135,16 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de const magma_int_t total_iters = num_pairs * li * max_litpo * li * max_litpo; magmaDoubleComplex *rmi = (magmaDoubleComplex *)(c1->rmi[0]); magmaDoubleComplex *rei = (magmaDoubleComplex *)(c1->rei[0]); magmaDoubleComplex vj = MAGMA_Z_MAKE(real(c1->vj), imag(c1->vj)); double *rxx = c1->rxx; double *ryy = c1->ryy; double *rzz = c1->rzz; int *ind3j = c1->ind3j[0]; double *v3j0 = c1->v3j0; magmaDoubleComplex *vh = (magmaDoubleComplex *)(c1->vh); magmaDoubleComplex *vyhj = (magmaDoubleComplex *)(c1->vyhj); magmaDoubleComplex *vj0 = (magmaDoubleComplex *)(c1->vj0); magmaDoubleComplex *vyj0 = (magmaDoubleComplex *)(c1->vyj0); /* THIS IS AN EXAMPLE OF WORKING ID INITIALIZATION #pragma omp target teams distribute parallel for \ Loading @@ -102,12 +167,15 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de } #pragma omp target teams distribute parallel for \ firstprivate(total_iters, max_litpo, num_pairs, li, ndi, ndit) \ firstprivate(total_iters, max_litpo, num_pairs, li, le, ndi, ndit, vj) \ map(to: lut_n1[0:num_pairs], lut_n2[0:num_pairs]) \ map(to: rxx[0:nsph], ryy[0:nsph], rzz[0:nsph]) \ map(to: ind3j[0:ind3j_size], v3j0[0:nv3j], vh[0:ncou*litpo]) \ map(to: vyhj[0:ncou*litpos], vj0[0:nsph*lmtpo], vyj0[0:nsph*lmtpos]) \ device(device_id) for (magma_int_t iter = 0; iter < total_iters; ++iter) { magma_int_t t = iter; // double rac3j[128]; double rac3j[MAX_GPU_STACK_DOUBLE]; // MAX_GPU_STACK_DOUBLE = 128 // Index calculation (from innermost to outermost) int im2 = (t % max_litpo) + 1; Loading Loading @@ -156,20 +224,28 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de // End of index 2 magnetic quantum numbers magmaDoubleComplex cgh, cgk, zvalue; cgh = (is_valid_iter) ? cz1 : // ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j) : // cz1 : // ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j) : magma_ghit<0>( 0, nbl, l1, m1, l2, m2, rxx, ryy, rzz, ind3j, v3j0, vh, vyhj, vj0, vyj0, vj, li, le, rac3j ) : cz0; cgk = (is_valid_iter) ? cz1 : // ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j) : // cz1 : // ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j) : magma_ghit<0>( 1, nbl, l1, m1, l2, m2, rxx, ryy, rzz, ind3j, v3j0, vh, vyhj, vj0, vyj0, vj, li, le, rac3j ) : cz0; if (is_valid_iter) { vec_am[(i1 - 1) * ndit + i2 - 1] = cgh; vec_am[(i1 - 1) * ndit + i2e - 1] = cgk; vec_am[(i1e - 1) * ndit + i2 - 1] = cgk; vec_am[(i1e - 1) * ndit + i2e - 1] = cgh; zvalue = MAGMA_Z_MUL(cgh, rsh); zvalue = magma_zprod(cgh, rsh); vec_am[(j1 - 1) * ndit + j2 - 1] = zvalue; vec_am[(j1e - 1) * ndit + j2e - 1] = zvalue; zvalue = MAGMA_Z_MUL(cgk, rsk); zvalue = magma_zprod(cgk, rsk); vec_am[(j1 - 1) * ndit + j2e - 1] = zvalue; vec_am[(j1e - 1) * ndit + j2 - 1] = zvalue; } Loading Loading @@ -209,6 +285,370 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de return MAGMA_SUCCESS; } #pragma omp begin declare target device_type(any) template<int IHI> magmaDoubleComplex magma_ghit( int ipamo, int nbl, int l1, int m1, int l2, int m2, double *rxx, double *ryy, double *rzz, int *ind3j, double *v3j0, magmaDoubleComplex *vh, magmaDoubleComplex *vyhj, magmaDoubleComplex *vj0, magmaDoubleComplex *vyj0, magmaDoubleComplex vj, int li, int le, double rac3j[] ) { /* NBL identifies transfer vector going from N2 to N1; * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin; * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */ const magmaDoubleComplex cz0 = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex uim = MAGMA_Z_MAKE(0.0, 1.0); // label 10 int lm = (le > li) ? le : li; int l1mp = l1 - ipamo; int l1po = l1 + 1; int m1mm2 = m1 - m2; int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2; int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1; int lmaxpo = l2 + l1mp + 1; int i3j0in = ind3j[lm * l1mp + l2 - 1]; int ilin = (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ? 0 : -1; int isn = (m1 % 2 != 0) ? -1 : 1; if (lminpo % 2 == 0) { isn *= -1; if (l2 > l1mp) isn *= -1; } // label 12 int nblmo = nbl - 1; if constexpr (IHI == 0) { int litpo = li + li + 1; int litpos = litpo * litpo; int nbhj = nblmo * litpo; int nby = nblmo * litpos; magmaDoubleComplex result = cz0; for (int jm24 = 1; jm24 <= 3; jm24++) { magmaDoubleComplex csum = cz0; int mu = jm24 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt14 = lminpo; while (lt14 <= lmaxpo) { i3j0++; int l3 = lt14 - 1; int ny = l3 * l3 + lt14; double aors = 1.0 * (l3 + lt14); magmaDoubleComplex f3j = MAGMA_Z_MAKE((v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); // dcomplex cfun = (vh[nbhj + lt14 - 1] * vyhj[nby + ny - 1]) * f3j; magmaDoubleComplex cfun = magma_zprod(vh[nbhj + lt14 - 1], vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, fcfun); jsn *= -1; lt14 += 2; } // goes to 22 } else { // label 16 magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt20 = lminpo; while (lt20 <= lmaxpo) { i3j0++; if (m1mm2m <= lt20) { il += 2; int l3 = lt20 - 1; int ny = l3 * l3 + lt20 + m1mm2; double aors = 1.0 * (l3 + lt20); magmaDoubleComplex f3j = MAGMA_Z_MAKE((rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); // dcomplex cfun = (vh[nbhj + lt20 - 1] * vyhj[nby + ny - 1]) * f3j; magmaDoubleComplex cfun = magma_zprod(vh[nbhj + lt20 - 1], vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, cfun); } // label 20 jsn *= -1; lt20 += 2; } } // label 22 //csum *= cr; csum = magma_zscale(csum, cr); result = magma_zadd(result, csum); } // Otherwise there is nothing to add } // jm24 loop. Should go to 70 magmaDoubleComplex cr = (ipamo == 1) ? MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) : MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0); return magma_zprod(result, cr); } else if constexpr (IHI == 1) { int litpo = li + li + 1; int litpos = litpo * litpo; int nbhj = nblmo * litpo; int nby = nblmo * litpos; magmaDoubleComplex result = cz0; for (int jm44 = 1; jm44 <= 3; jm44++) { magmaDoubleComplex csum = cz0; int mu = jm44 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = - isn; if (mu == 0) jsn = isn; double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt34 = lminpo; while (lt34 <= lmaxpo) { i3j0++; int l3 = lt34 - 1; int ny = l3 * l3 + lt34; double aors = 1.0 * (l3 + lt34); magmaDoubleComplex f3j = MAGMA_Z_MAKE((v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj, vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, fcfun); jsn *= -1; lt34 += 2; } // goes to 42 } else { // label 36 magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt40 = lminpo; while (lt40 <= lmaxpo) { i3j0++; if (m1mm2m <= lt40) { il += 2; int l3 = lt40 - 1; int ny = l3 * l3 + lt40 + m1mm2; double aors = 1.0 * (l3 + lt40); magmaDoubleComplex f3j = MAGMA_Z_MAKE((rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj, vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, cfun); } // label 40 jsn *= -1; lt40 += 2; } } // label 42 csum = magma_zscale(csum, cr); result = magma_zadd(result, csum); } // Otherwise there is nothing to add } // jm44 loop. Should go to 70 magmaDoubleComplex cr = (ipamo == 1) ? MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) : MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0); return magma_zprod(result, cr); } else { // The project grants that IHI = 2 whenever it is neither 0 nor 1. int lmtpo = lm + lm + 1; int lmtpos = lmtpo * lmtpo; int nbhj = nblmo * lmtpo; int nby = nblmo * lmtpos; magmaDoubleComplex result = cz0; if (rxx[nbl - 1] == 0.0 && ryy[nbl - 1] == 0.0 && rzz[nbl - 1] == 0.0) { if (ipamo == 0) { if (l1 == l2 && m1 == m2) return MAGMA_Z_MAKE(1.0, 0.0); } return MAGMA_Z_MAKE(0.0, 0.0); } for (int jm64 = 1; jm64 <= 3; jm64++) { magmaDoubleComplex csum = cz0; int mu = jm64 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt54 = lminpo; while (lt54 <= lmaxpo) { i3j0++; int l3 = lt54 - 1; int ny = l3 * l3 + lt54; double aors = 1.0 * (l3 + lt54); magmaDoubleComplex f3j = MAGMA_Z_MAKE(v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj0[nbhj + lt54 - 1], vyj0[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, fcfun); jsn *= -1; lt54 += 2; } // goes to 62 } else { // label 56 magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt60 = lminpo; while (lt60 <= lmaxpo) { i3j0++; if (m1mm2m <= lt60) { il += 2; int l3 = lt60 - 1; int ny = l3 * l3 + lt60 + m1mm2; double aors = 1.0 * (l3 + lt60); magmaDoubleComplex f3j = MAGMA_Z_MAKE(rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj0[nbhj + lt60 - 1], vyj0[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, cfun); } // label 60 jsn *= -1; lt60 += 2; } } // label 62 csum = magma_zscale(csum, cr); result = magma_zadd(result, csum); } // Otherwise there is nothing to add } // jm64 loop. Should go to 70 magmaDoubleComplex cr = (ipamo == 1) ? MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) : MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0); return magma_zprod(result, cr); } } #pragma omp end declare target #pragma omp begin declare target device_type(any) /** * \brief Compute the 3j symbol for Clebsch-Gordan coefficients for JJ transitions. * * This function calculates the 3j(J,J2,J3;-M2-M3,M2,M3) symbol for the Clebsch-Gordan * coefficients. See Appendix a.3.1 in Borghese, Denti & Saija (2007). This function * is a specialized version to work on GPUs with MAGMA. * * \param j2: `int` Value of J2. [IN] * \param j3: `int` Value of J3. [IN] * \param m2: `int` Value of M2. [IN] * \param m3: `int` Value of M3. [IN] * \param rac3j: `double[]` Vector of 3j symbols. [OUT] */ void magma_r3jjr(int j2, int j3, int m2, int m3, double rac3j[]) { int jmx = j3 + j2; int jdf = j3 - j2; int m1 = -m2 - m3; int abs_jdf = (jdf >= 0) ? jdf : -jdf; int abs_m1 = (m1 >= 0) ? m1 : -m1; int jmn = (abs_jdf > abs_m1) ? abs_jdf : abs_m1; int njmo = jmx - jmn; int jf = jmx + jmx + 1; int isn = ((jdf + m1) % 2 == 0) ? 1 : -1; if (njmo <= 0) { double sj = 1.0 * jf; double cnr = (1.0 / sqrt(sj)) * isn; rac3j[0] = cnr; } else { // label 15 double sjt = 1.0; double sjr = 1.0 * jf; int jsmpos = (jmx + 1) * (jmx + 1); int jdfs = jdf * jdf; int m1s = m1 * m1; int mdf = m3 - m2; int idjc = m1 * (j3 * (j3 + 1) - j2 * (j2 +1)); int j1 = jmx; int j1s = j1 * j1; int j1po = j1 + 1; double ccj = 1.0 * (j1s - jdfs) * (j1s - m1s); double cj = sqrt(ccj * (jsmpos - j1s)); double dj = 1.0 * jf * (j1 * j1po * mdf + idjc); if (njmo <= 1) { rac3j[0] = -dj / (cj * j1po); double sj = sjr + (rac3j[0] * rac3j[0]) * (jf - 2); double cnr = (1.0 / sqrt(sj)) * isn; rac3j[1] = cnr; rac3j[0] *= cnr; } else { // label 20 double cjp = 0.0; int nj = njmo + 1; int nmat = (nj + 1) / 2; rac3j[nj - 1] = 1.0; rac3j[njmo - 1] = -dj / (cj * j1po); if (nmat != njmo) { int nbr = njmo - nmat; for (int ibr45 = 1; ibr45 <= nbr; ibr45++) { int irr = nj - ibr45; jf -= 2; j1--; j1s = j1 * j1; j1po = j1 + 1; cjp = cj; ccj = 1.0 * (j1s - jdfs) * (j1s - m1s); cj = sqrt(ccj * (jsmpos - j1s)); sjt = rac3j[irr - 1] * rac3j[irr - 1]; dj = 1.0 * jf * (j1 * j1po * mdf + idjc); rac3j[irr - 2] = -(rac3j[irr - 1] * dj + rac3j[irr] * cjp * j1) / (cj * j1po); sjr += (sjt * jf); } // ibr45 loop } // label 50 double osjt = sjt; sjt = rac3j[nmat - 1] * rac3j[nmat - 1]; if (sjt >= osjt) { sjr += (sjt * (jf - 2)); } else { // label 55 nmat++; } // label 60 double racmat = rac3j[nmat - 1]; rac3j[0] = 1.0; jf = jmn + jmn + 1; double sjl = 1.0 * jf; j1 = jmn; if (j1 != 0) { j1po = j1 + 1; int j1pos = j1po * j1po; double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s); cjp = sqrt(ccjp * (jsmpos - j1pos)); dj = 1.0 * jf * (j1 * j1po * mdf + idjc); rac3j[1] = - dj / (cjp * j1); } else { // label 62 cjp = sqrt(1.0 * (jsmpos - 1)); dj = 1.0 * mdf; rac3j[1] = -dj / cjp; } // label 63 int nmatmo = nmat - 1; if (nmatmo >= 2) { for (int irl70 = 2; irl70 <= nmatmo; irl70++) { jf += 2; j1++; j1po = j1 + 1; int j1pos = j1po * j1po; cj = cjp; double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s); cjp = sqrt(ccjp * (jsmpos - j1pos)); sjt = rac3j[irl70 - 1] * rac3j[irl70 - 1]; dj = 1.0 * jf * (j1 * j1po * mdf + idjc); rac3j[irl70] = -( rac3j[irl70 - 1] * dj + rac3j[irl70 - 2] * cj * j1po ) / (cjp * j1); sjl += (sjt * jf); } } // label 75 double ratrac = racmat / rac3j[nmat - 1]; double rats = ratrac * ratrac; double sj = sjr + sjl * rats; rac3j[nmat - 1] = racmat; double cnr = (1.0 / sqrt(sj)) * isn; for (int irr80 = nmat; irr80 <= nj; irr80++) rac3j[irr80 - 1] *= cnr; double cnl = cnr * ratrac; for (int irl85 = 1; irl85 <= nmatmo; irl85++) rac3j[irl85 - 1] *= cnl; } } } #pragma omp end declare target #endif // USE_TARGET_OFFLOAD void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const RuntimeSettings& rs) { magmaDoubleComplex *a = (magmaDoubleComplex *)mat[0]; // pointer to first element on host string message; Loading Loading
src/libnptm/magma_calls.cpp +447 −7 Original line number Diff line number Diff line Loading @@ -63,14 +63,69 @@ using namespace std; #define FOUR_PI 12.566370614359172 #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) inline magmaDoubleComplex magma_zadd(const magmaDoubleComplex& a, const magmaDoubleComplex& b) { return MAGMA_Z_MAKE(MAGMA_Z_REAL(a)+MAGMA_Z_REAL(b),MAGMA_Z_IMAG(a)+MAGMA_Z_IMAG(b)); } #pragma omp end declare target #pragma omp begin declare target device_type(any) inline magmaDoubleComplex magma_zprod(const magmaDoubleComplex& a, const magmaDoubleComplex& b) { return MAGMA_Z_MAKE( MAGMA_Z_REAL(a)*MAGMA_Z_REAL(b)-MAGMA_Z_IMAG(a)*MAGMA_Z_IMAG(b), MAGMA_Z_REAL(a)*MAGMA_Z_IMAG(b)+MAGMA_Z_IMAG(a)*MAGMA_Z_REAL(b) ); } #pragma omp end declare target #pragma omp begin declare target device_type(any) inline magmaDoubleComplex magma_zscale(const magmaDoubleComplex& a, const double& b) { return MAGMA_Z_MAKE(MAGMA_Z_REAL(a)*b,MAGMA_Z_IMAG(a)*b); } #pragma omp end declare target #pragma omp begin declare target device_type(any) double magma_cgev(int ipamo, int mu, int l, int m) { double xd, xn; if (ipamo == 0) { if (mu != 0) { xd = 2.0 * l * (l + 1); xn = (mu < 0) ? 1.0 * (l + m) * (l - m + 1) : 1.0 * (l - m) * (l + m + 1); double result = sqrt(xn / xd); return (mu <= 0) ? result : -result; } else { // mu == 0 return -1.0 * m / sqrt(1.0 * (l + 1) * l); } return 0.0; } else { // ipamo != 0 xd = 2.0 * l * (l * 2 - 1); xn = (mu == 0) ? 2.0 * (l - m) * (l + m) : ((mu < 0) ? 1.0 * (l - 1 + m) * (l + m) : 1.0 * (l - 1 - m) * (l - m)); return sqrt(xn / xd); } } #pragma omp end declare target magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int device_id) { const magmaDoubleComplex cz0 = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex cz1 = MAGMA_Z_MAKE(1.0, 0.0); const int nsph = c1->nsph; const int nsphmo = c1->nsph - 1; const int nsphmo = nsph - 1; const int ncou = nsph * nsphmo; const int li = c1->li; const int litpo = li + li + 1; const int litpos = litpo * litpo; const int le = c1->le; const int lmtpo = c1->lmtpo; const int lmtpos = c1->lmtpos; const int rsize = li * nsph; const int ind3j_size = (c1->lm + 1) * c1->lm; const int nv3j = c1->nv3j; const magma_int_t max_litpo = 2 * li + 1; const magma_int_t nlim = li * (li + 2); const magma_int_t ndi = nsph * nlim; Loading @@ -80,6 +135,16 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de const magma_int_t total_iters = num_pairs * li * max_litpo * li * max_litpo; magmaDoubleComplex *rmi = (magmaDoubleComplex *)(c1->rmi[0]); magmaDoubleComplex *rei = (magmaDoubleComplex *)(c1->rei[0]); magmaDoubleComplex vj = MAGMA_Z_MAKE(real(c1->vj), imag(c1->vj)); double *rxx = c1->rxx; double *ryy = c1->ryy; double *rzz = c1->rzz; int *ind3j = c1->ind3j[0]; double *v3j0 = c1->v3j0; magmaDoubleComplex *vh = (magmaDoubleComplex *)(c1->vh); magmaDoubleComplex *vyhj = (magmaDoubleComplex *)(c1->vyhj); magmaDoubleComplex *vj0 = (magmaDoubleComplex *)(c1->vj0); magmaDoubleComplex *vyj0 = (magmaDoubleComplex *)(c1->vyj0); /* THIS IS AN EXAMPLE OF WORKING ID INITIALIZATION #pragma omp target teams distribute parallel for \ Loading @@ -102,12 +167,15 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de } #pragma omp target teams distribute parallel for \ firstprivate(total_iters, max_litpo, num_pairs, li, ndi, ndit) \ firstprivate(total_iters, max_litpo, num_pairs, li, le, ndi, ndit, vj) \ map(to: lut_n1[0:num_pairs], lut_n2[0:num_pairs]) \ map(to: rxx[0:nsph], ryy[0:nsph], rzz[0:nsph]) \ map(to: ind3j[0:ind3j_size], v3j0[0:nv3j], vh[0:ncou*litpo]) \ map(to: vyhj[0:ncou*litpos], vj0[0:nsph*lmtpo], vyj0[0:nsph*lmtpos]) \ device(device_id) for (magma_int_t iter = 0; iter < total_iters; ++iter) { magma_int_t t = iter; // double rac3j[128]; double rac3j[MAX_GPU_STACK_DOUBLE]; // MAX_GPU_STACK_DOUBLE = 128 // Index calculation (from innermost to outermost) int im2 = (t % max_litpo) + 1; Loading Loading @@ -156,20 +224,28 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de // End of index 2 magnetic quantum numbers magmaDoubleComplex cgh, cgk, zvalue; cgh = (is_valid_iter) ? cz1 : // ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j) : // cz1 : // ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j) : magma_ghit<0>( 0, nbl, l1, m1, l2, m2, rxx, ryy, rzz, ind3j, v3j0, vh, vyhj, vj0, vyj0, vj, li, le, rac3j ) : cz0; cgk = (is_valid_iter) ? cz1 : // ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j) : // cz1 : // ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j) : magma_ghit<0>( 1, nbl, l1, m1, l2, m2, rxx, ryy, rzz, ind3j, v3j0, vh, vyhj, vj0, vyj0, vj, li, le, rac3j ) : cz0; if (is_valid_iter) { vec_am[(i1 - 1) * ndit + i2 - 1] = cgh; vec_am[(i1 - 1) * ndit + i2e - 1] = cgk; vec_am[(i1e - 1) * ndit + i2 - 1] = cgk; vec_am[(i1e - 1) * ndit + i2e - 1] = cgh; zvalue = MAGMA_Z_MUL(cgh, rsh); zvalue = magma_zprod(cgh, rsh); vec_am[(j1 - 1) * ndit + j2 - 1] = zvalue; vec_am[(j1e - 1) * ndit + j2e - 1] = zvalue; zvalue = MAGMA_Z_MUL(cgk, rsk); zvalue = magma_zprod(cgk, rsk); vec_am[(j1 - 1) * ndit + j2e - 1] = zvalue; vec_am[(j1e - 1) * ndit + j2 - 1] = zvalue; } Loading Loading @@ -209,6 +285,370 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de return MAGMA_SUCCESS; } #pragma omp begin declare target device_type(any) template<int IHI> magmaDoubleComplex magma_ghit( int ipamo, int nbl, int l1, int m1, int l2, int m2, double *rxx, double *ryy, double *rzz, int *ind3j, double *v3j0, magmaDoubleComplex *vh, magmaDoubleComplex *vyhj, magmaDoubleComplex *vj0, magmaDoubleComplex *vyj0, magmaDoubleComplex vj, int li, int le, double rac3j[] ) { /* NBL identifies transfer vector going from N2 to N1; * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin; * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */ const magmaDoubleComplex cz0 = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex uim = MAGMA_Z_MAKE(0.0, 1.0); // label 10 int lm = (le > li) ? le : li; int l1mp = l1 - ipamo; int l1po = l1 + 1; int m1mm2 = m1 - m2; int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2; int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1; int lmaxpo = l2 + l1mp + 1; int i3j0in = ind3j[lm * l1mp + l2 - 1]; int ilin = (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ? 0 : -1; int isn = (m1 % 2 != 0) ? -1 : 1; if (lminpo % 2 == 0) { isn *= -1; if (l2 > l1mp) isn *= -1; } // label 12 int nblmo = nbl - 1; if constexpr (IHI == 0) { int litpo = li + li + 1; int litpos = litpo * litpo; int nbhj = nblmo * litpo; int nby = nblmo * litpos; magmaDoubleComplex result = cz0; for (int jm24 = 1; jm24 <= 3; jm24++) { magmaDoubleComplex csum = cz0; int mu = jm24 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt14 = lminpo; while (lt14 <= lmaxpo) { i3j0++; int l3 = lt14 - 1; int ny = l3 * l3 + lt14; double aors = 1.0 * (l3 + lt14); magmaDoubleComplex f3j = MAGMA_Z_MAKE((v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); // dcomplex cfun = (vh[nbhj + lt14 - 1] * vyhj[nby + ny - 1]) * f3j; magmaDoubleComplex cfun = magma_zprod(vh[nbhj + lt14 - 1], vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, fcfun); jsn *= -1; lt14 += 2; } // goes to 22 } else { // label 16 magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt20 = lminpo; while (lt20 <= lmaxpo) { i3j0++; if (m1mm2m <= lt20) { il += 2; int l3 = lt20 - 1; int ny = l3 * l3 + lt20 + m1mm2; double aors = 1.0 * (l3 + lt20); magmaDoubleComplex f3j = MAGMA_Z_MAKE((rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); // dcomplex cfun = (vh[nbhj + lt20 - 1] * vyhj[nby + ny - 1]) * f3j; magmaDoubleComplex cfun = magma_zprod(vh[nbhj + lt20 - 1], vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, cfun); } // label 20 jsn *= -1; lt20 += 2; } } // label 22 //csum *= cr; csum = magma_zscale(csum, cr); result = magma_zadd(result, csum); } // Otherwise there is nothing to add } // jm24 loop. Should go to 70 magmaDoubleComplex cr = (ipamo == 1) ? MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) : MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0); return magma_zprod(result, cr); } else if constexpr (IHI == 1) { int litpo = li + li + 1; int litpos = litpo * litpo; int nbhj = nblmo * litpo; int nby = nblmo * litpos; magmaDoubleComplex result = cz0; for (int jm44 = 1; jm44 <= 3; jm44++) { magmaDoubleComplex csum = cz0; int mu = jm44 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = - isn; if (mu == 0) jsn = isn; double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt34 = lminpo; while (lt34 <= lmaxpo) { i3j0++; int l3 = lt34 - 1; int ny = l3 * l3 + lt34; double aors = 1.0 * (l3 + lt34); magmaDoubleComplex f3j = MAGMA_Z_MAKE((v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj, vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, fcfun); jsn *= -1; lt34 += 2; } // goes to 42 } else { // label 36 magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt40 = lminpo; while (lt40 <= lmaxpo) { i3j0++; if (m1mm2m <= lt40) { il += 2; int l3 = lt40 - 1; int ny = l3 * l3 + lt40 + m1mm2; double aors = 1.0 * (l3 + lt40); magmaDoubleComplex f3j = MAGMA_Z_MAKE((rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj, vyhj[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, cfun); } // label 40 jsn *= -1; lt40 += 2; } } // label 42 csum = magma_zscale(csum, cr); result = magma_zadd(result, csum); } // Otherwise there is nothing to add } // jm44 loop. Should go to 70 magmaDoubleComplex cr = (ipamo == 1) ? MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) : MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0); return magma_zprod(result, cr); } else { // The project grants that IHI = 2 whenever it is neither 0 nor 1. int lmtpo = lm + lm + 1; int lmtpos = lmtpo * lmtpo; int nbhj = nblmo * lmtpo; int nby = nblmo * lmtpos; magmaDoubleComplex result = cz0; if (rxx[nbl - 1] == 0.0 && ryy[nbl - 1] == 0.0 && rzz[nbl - 1] == 0.0) { if (ipamo == 0) { if (l1 == l2 && m1 == m2) return MAGMA_Z_MAKE(1.0, 0.0); } return MAGMA_Z_MAKE(0.0, 0.0); } for (int jm64 = 1; jm64 <= 3; jm64++) { magmaDoubleComplex csum = cz0; int mu = jm64 - 2; int mupm1 = mu + m1; int mupm2 = mu + m2; if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { int jsn = -isn; if (mu == 0) jsn = isn; double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2); int i3j0 = i3j0in; if (mupm1 == 0 && mupm2 == 0) { int lt54 = lminpo; while (lt54 <= lmaxpo) { i3j0++; int l3 = lt54 - 1; int ny = l3 * l3 + lt54; double aors = 1.0 * (l3 + lt54); magmaDoubleComplex f3j = MAGMA_Z_MAKE(v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj0[nbhj + lt54 - 1], vyj0[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, fcfun); jsn *= -1; lt54 += 2; } // goes to 62 } else { // label 56 magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j); int il = ilin; int lt60 = lminpo; while (lt60 <= lmaxpo) { i3j0++; if (m1mm2m <= lt60) { il += 2; int l3 = lt60 - 1; int ny = l3 * l3 + lt60 + m1mm2; double aors = 1.0 * (l3 + lt60); magmaDoubleComplex f3j = MAGMA_Z_MAKE(rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors) * jsn, 0.0); magmaDoubleComplex cfun = magma_zprod(vj0[nbhj + lt60 - 1], vyj0[nby + ny - 1]); magmaDoubleComplex fcfun = magma_zprod(cfun, f3j); csum = magma_zadd(csum, cfun); } // label 60 jsn *= -1; lt60 += 2; } } // label 62 csum = magma_zscale(csum, cr); result = magma_zadd(result, csum); } // Otherwise there is nothing to add } // jm64 loop. Should go to 70 magmaDoubleComplex cr = (ipamo == 1) ? MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) : MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0); return magma_zprod(result, cr); } } #pragma omp end declare target #pragma omp begin declare target device_type(any) /** * \brief Compute the 3j symbol for Clebsch-Gordan coefficients for JJ transitions. * * This function calculates the 3j(J,J2,J3;-M2-M3,M2,M3) symbol for the Clebsch-Gordan * coefficients. See Appendix a.3.1 in Borghese, Denti & Saija (2007). This function * is a specialized version to work on GPUs with MAGMA. * * \param j2: `int` Value of J2. [IN] * \param j3: `int` Value of J3. [IN] * \param m2: `int` Value of M2. [IN] * \param m3: `int` Value of M3. [IN] * \param rac3j: `double[]` Vector of 3j symbols. [OUT] */ void magma_r3jjr(int j2, int j3, int m2, int m3, double rac3j[]) { int jmx = j3 + j2; int jdf = j3 - j2; int m1 = -m2 - m3; int abs_jdf = (jdf >= 0) ? jdf : -jdf; int abs_m1 = (m1 >= 0) ? m1 : -m1; int jmn = (abs_jdf > abs_m1) ? abs_jdf : abs_m1; int njmo = jmx - jmn; int jf = jmx + jmx + 1; int isn = ((jdf + m1) % 2 == 0) ? 1 : -1; if (njmo <= 0) { double sj = 1.0 * jf; double cnr = (1.0 / sqrt(sj)) * isn; rac3j[0] = cnr; } else { // label 15 double sjt = 1.0; double sjr = 1.0 * jf; int jsmpos = (jmx + 1) * (jmx + 1); int jdfs = jdf * jdf; int m1s = m1 * m1; int mdf = m3 - m2; int idjc = m1 * (j3 * (j3 + 1) - j2 * (j2 +1)); int j1 = jmx; int j1s = j1 * j1; int j1po = j1 + 1; double ccj = 1.0 * (j1s - jdfs) * (j1s - m1s); double cj = sqrt(ccj * (jsmpos - j1s)); double dj = 1.0 * jf * (j1 * j1po * mdf + idjc); if (njmo <= 1) { rac3j[0] = -dj / (cj * j1po); double sj = sjr + (rac3j[0] * rac3j[0]) * (jf - 2); double cnr = (1.0 / sqrt(sj)) * isn; rac3j[1] = cnr; rac3j[0] *= cnr; } else { // label 20 double cjp = 0.0; int nj = njmo + 1; int nmat = (nj + 1) / 2; rac3j[nj - 1] = 1.0; rac3j[njmo - 1] = -dj / (cj * j1po); if (nmat != njmo) { int nbr = njmo - nmat; for (int ibr45 = 1; ibr45 <= nbr; ibr45++) { int irr = nj - ibr45; jf -= 2; j1--; j1s = j1 * j1; j1po = j1 + 1; cjp = cj; ccj = 1.0 * (j1s - jdfs) * (j1s - m1s); cj = sqrt(ccj * (jsmpos - j1s)); sjt = rac3j[irr - 1] * rac3j[irr - 1]; dj = 1.0 * jf * (j1 * j1po * mdf + idjc); rac3j[irr - 2] = -(rac3j[irr - 1] * dj + rac3j[irr] * cjp * j1) / (cj * j1po); sjr += (sjt * jf); } // ibr45 loop } // label 50 double osjt = sjt; sjt = rac3j[nmat - 1] * rac3j[nmat - 1]; if (sjt >= osjt) { sjr += (sjt * (jf - 2)); } else { // label 55 nmat++; } // label 60 double racmat = rac3j[nmat - 1]; rac3j[0] = 1.0; jf = jmn + jmn + 1; double sjl = 1.0 * jf; j1 = jmn; if (j1 != 0) { j1po = j1 + 1; int j1pos = j1po * j1po; double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s); cjp = sqrt(ccjp * (jsmpos - j1pos)); dj = 1.0 * jf * (j1 * j1po * mdf + idjc); rac3j[1] = - dj / (cjp * j1); } else { // label 62 cjp = sqrt(1.0 * (jsmpos - 1)); dj = 1.0 * mdf; rac3j[1] = -dj / cjp; } // label 63 int nmatmo = nmat - 1; if (nmatmo >= 2) { for (int irl70 = 2; irl70 <= nmatmo; irl70++) { jf += 2; j1++; j1po = j1 + 1; int j1pos = j1po * j1po; cj = cjp; double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s); cjp = sqrt(ccjp * (jsmpos - j1pos)); sjt = rac3j[irl70 - 1] * rac3j[irl70 - 1]; dj = 1.0 * jf * (j1 * j1po * mdf + idjc); rac3j[irl70] = -( rac3j[irl70 - 1] * dj + rac3j[irl70 - 2] * cj * j1po ) / (cjp * j1); sjl += (sjt * jf); } } // label 75 double ratrac = racmat / rac3j[nmat - 1]; double rats = ratrac * ratrac; double sj = sjr + sjl * rats; rac3j[nmat - 1] = racmat; double cnr = (1.0 / sqrt(sj)) * isn; for (int irr80 = nmat; irr80 <= nj; irr80++) rac3j[irr80 - 1] *= cnr; double cnl = cnr * ratrac; for (int irl85 = 1; irl85 <= nmatmo; irl85++) rac3j[irl85 - 1] *= cnl; } } } #pragma omp end declare target #endif // USE_TARGET_OFFLOAD void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const RuntimeSettings& rs) { magmaDoubleComplex *a = (magmaDoubleComplex *)mat[0]; // pointer to first element on host string message; Loading