Commit af01e8e8 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Implement GPU resident cms() logic

parent 435e0312
Loading
Loading
Loading
Loading
+447 −7
Original line number Diff line number Diff line
@@ -63,14 +63,69 @@

using namespace std;

#define FOUR_PI 12.566370614359172

#ifdef USE_TARGET_OFFLOAD

#pragma omp begin declare target device_type(any)
inline magmaDoubleComplex magma_zadd(const magmaDoubleComplex& a, const magmaDoubleComplex& b) {
  return MAGMA_Z_MAKE(MAGMA_Z_REAL(a)+MAGMA_Z_REAL(b),MAGMA_Z_IMAG(a)+MAGMA_Z_IMAG(b));
}
#pragma omp end declare target

#pragma omp begin declare target device_type(any)
inline magmaDoubleComplex magma_zprod(const magmaDoubleComplex& a, const magmaDoubleComplex& b) {
  return MAGMA_Z_MAKE(
    MAGMA_Z_REAL(a)*MAGMA_Z_REAL(b)-MAGMA_Z_IMAG(a)*MAGMA_Z_IMAG(b),
    MAGMA_Z_REAL(a)*MAGMA_Z_IMAG(b)+MAGMA_Z_IMAG(a)*MAGMA_Z_REAL(b)
  );
}
#pragma omp end declare target

#pragma omp begin declare target device_type(any)
inline magmaDoubleComplex magma_zscale(const magmaDoubleComplex& a, const double& b) {
  return MAGMA_Z_MAKE(MAGMA_Z_REAL(a)*b,MAGMA_Z_IMAG(a)*b);
}
#pragma omp end declare target

#pragma omp begin declare target device_type(any)
double magma_cgev(int ipamo, int mu, int l, int m) {
  double xd, xn;
  if (ipamo == 0) {
    if (mu != 0) {
      xd = 2.0 * l * (l + 1);
      xn = (mu < 0) ? 1.0 * (l + m) * (l - m + 1)
	: 1.0 * (l - m) * (l + m + 1);
      double result = sqrt(xn / xd);
      return (mu <= 0) ? result : -result;
    } else { // mu == 0
      return -1.0 * m / sqrt(1.0 * (l + 1) * l);
    }
    return 0.0;
  } else { // ipamo != 0
    xd = 2.0 * l * (l * 2 - 1);
    xn = (mu == 0) ? 2.0 * (l - m) * (l + m)
      : ((mu < 0) ? 1.0 * (l - 1 + m) * (l + m) : 1.0 * (l - 1 - m) * (l - m));
    return sqrt(xn / xd);
  }
}
#pragma omp end declare target

magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int device_id) {
  const magmaDoubleComplex cz0 = MAGMA_Z_MAKE(0.0, 0.0);
  const magmaDoubleComplex cz1 = MAGMA_Z_MAKE(1.0, 0.0); 
  const int nsph = c1->nsph;
  const int nsphmo = c1->nsph - 1;
  const int nsphmo = nsph - 1;
  const int ncou = nsph * nsphmo;
  const int li = c1->li;
  const int litpo = li + li + 1;
  const int litpos = litpo * litpo;
  const int le = c1->le;
  const int lmtpo = c1->lmtpo;
  const int lmtpos = c1->lmtpos;
  const int rsize = li * nsph;
  const int ind3j_size = (c1->lm + 1) * c1->lm;
  const int nv3j = c1->nv3j;
  const magma_int_t max_litpo = 2 * li + 1;
  const magma_int_t nlim = li * (li + 2);
  const magma_int_t ndi = nsph * nlim;
@@ -80,6 +135,16 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de
  const magma_int_t total_iters = num_pairs * li * max_litpo * li * max_litpo;
  magmaDoubleComplex *rmi = (magmaDoubleComplex *)(c1->rmi[0]);
  magmaDoubleComplex *rei = (magmaDoubleComplex *)(c1->rei[0]);
  magmaDoubleComplex vj = MAGMA_Z_MAKE(real(c1->vj), imag(c1->vj));
  double *rxx = c1->rxx;
  double *ryy = c1->ryy;
  double *rzz = c1->rzz;
  int *ind3j = c1->ind3j[0];
  double *v3j0 = c1->v3j0;
  magmaDoubleComplex *vh = (magmaDoubleComplex *)(c1->vh);
  magmaDoubleComplex *vyhj = (magmaDoubleComplex *)(c1->vyhj);
  magmaDoubleComplex *vj0 = (magmaDoubleComplex *)(c1->vj0);
  magmaDoubleComplex *vyj0 = (magmaDoubleComplex *)(c1->vyj0);

  /* THIS IS AN EXAMPLE OF WORKING ID INITIALIZATION
#pragma omp target teams distribute parallel for \
@@ -102,12 +167,15 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de
  }
  
#pragma omp target teams distribute parallel for \
  firstprivate(total_iters, max_litpo, num_pairs, li, ndi, ndit) \
  firstprivate(total_iters, max_litpo, num_pairs, li, le, ndi, ndit, vj) \
  map(to: lut_n1[0:num_pairs], lut_n2[0:num_pairs]) \
  map(to: rxx[0:nsph], ryy[0:nsph], rzz[0:nsph]) \
  map(to: ind3j[0:ind3j_size], v3j0[0:nv3j], vh[0:ncou*litpo]) \
  map(to: vyhj[0:ncou*litpos], vj0[0:nsph*lmtpo], vyj0[0:nsph*lmtpos]) \
  device(device_id)
  for (magma_int_t iter = 0; iter < total_iters; ++iter) {
    magma_int_t t = iter;
    // double rac3j[128];
    double rac3j[MAX_GPU_STACK_DOUBLE]; // MAX_GPU_STACK_DOUBLE = 128

    // Index calculation (from innermost to outermost)
    int im2 = (t % max_litpo) + 1;
@@ -156,20 +224,28 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de
    // End of index 2 magnetic quantum numbers
    magmaDoubleComplex cgh, cgk, zvalue;
    cgh = (is_valid_iter) ?
      cz1 : // ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j) :
      // cz1 : // ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j) :
      magma_ghit<0>(
	0, nbl, l1, m1, l2, m2, rxx, ryy, rzz, ind3j, v3j0, vh,
	vyhj, vj0, vyj0, vj, li, le, rac3j
      ) :
      cz0;
    cgk = (is_valid_iter) ?
      cz1 : // ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j) :
      // cz1 : // ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j) :
      magma_ghit<0>(
        1, nbl, l1, m1, l2, m2, rxx, ryy, rzz, ind3j, v3j0, vh,
	vyhj, vj0, vyj0, vj, li, le, rac3j
      ) :
      cz0;
    if (is_valid_iter) {
      vec_am[(i1 - 1) * ndit + i2 - 1] = cgh;
      vec_am[(i1 - 1) * ndit + i2e - 1] = cgk;
      vec_am[(i1e - 1) * ndit + i2 - 1] = cgk;
      vec_am[(i1e - 1) * ndit + i2e - 1] = cgh;
      zvalue = MAGMA_Z_MUL(cgh, rsh);
      zvalue = magma_zprod(cgh, rsh);
      vec_am[(j1 - 1) * ndit + j2 - 1] = zvalue;
      vec_am[(j1e - 1) * ndit + j2e - 1] = zvalue;
      zvalue = MAGMA_Z_MUL(cgk, rsk);
      zvalue = magma_zprod(cgk, rsk);
      vec_am[(j1 - 1) * ndit + j2e - 1] = zvalue;
      vec_am[(j1e - 1) * ndit + j2 - 1] = zvalue;
    }
@@ -209,6 +285,370 @@ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int de
  return MAGMA_SUCCESS;
}

#pragma omp begin declare target device_type(any)
template<int IHI> magmaDoubleComplex magma_ghit(
  int ipamo, int nbl, int l1, int m1, int l2, int m2,
  double *rxx, double *ryy, double *rzz, int *ind3j,
  double *v3j0, magmaDoubleComplex *vh, magmaDoubleComplex *vyhj,
  magmaDoubleComplex *vj0, magmaDoubleComplex *vyj0,
  magmaDoubleComplex vj, int li, int le, double rac3j[]
) {
  /* NBL identifies transfer vector going from N2 to N1;
   * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin;
   * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */
  const magmaDoubleComplex cz0 = MAGMA_Z_MAKE(0.0, 0.0);
  const magmaDoubleComplex uim = MAGMA_Z_MAKE(0.0, 1.0);
  
  // label 10
  int lm = (le > li) ? le : li;
  int l1mp = l1 - ipamo;
  int l1po = l1 + 1;
  int m1mm2 = m1 - m2;
  int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2;
  int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1;
  int lmaxpo = l2 + l1mp + 1;
  int i3j0in = ind3j[lm * l1mp + l2 - 1];
  int ilin = (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ? 0 : -1;
  int isn = (m1 % 2 != 0) ? -1 : 1;
  if (lminpo % 2 == 0) {
    isn *= -1;
    if (l2 > l1mp) isn *= -1;
  }
  // label 12
  int nblmo = nbl - 1;
  
  if constexpr (IHI == 0) {
    int litpo = li + li + 1;
    int litpos = litpo * litpo;
    int nbhj = nblmo * litpo;
    int nby = nblmo * litpos;
    magmaDoubleComplex result = cz0;
    for (int jm24 = 1; jm24 <= 3; jm24++) {
      magmaDoubleComplex csum = cz0;
      int mu = jm24 - 2;
      int mupm1 = mu + m1;
      int mupm2 = mu + m2;
      if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) {
	int jsn = -isn;
	if (mu == 0) jsn = isn;
	double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2);
	int i3j0 = i3j0in;
	if (mupm1 == 0 && mupm2 == 0) {
	  int lt14 = lminpo;
	  while (lt14 <= lmaxpo) {
	    i3j0++;
	    int l3 = lt14 - 1;
	    int ny = l3 * l3 + lt14;
	    double aors = 1.0 * (l3 + lt14);
	    magmaDoubleComplex f3j = MAGMA_Z_MAKE((v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0);
	    // dcomplex cfun = (vh[nbhj + lt14 - 1] * vyhj[nby + ny - 1]) * f3j;
	    magmaDoubleComplex cfun = magma_zprod(vh[nbhj + lt14 - 1], vyhj[nby + ny - 1]);
	    magmaDoubleComplex fcfun = magma_zprod(cfun, f3j);
	    csum = magma_zadd(csum, fcfun);
	    jsn *= -1;
	    lt14 += 2;
	  }
	  // goes to 22
	} else { // label 16
	  magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j);
	  int il = ilin;
	  int lt20 = lminpo;
	  while (lt20 <= lmaxpo) {
	    i3j0++;
	    if (m1mm2m <= lt20) {
	      il += 2;
	      int l3 = lt20 - 1;
	      int ny = l3 * l3  + lt20 + m1mm2;
	      double aors = 1.0 * (l3 + lt20);
	      magmaDoubleComplex f3j = MAGMA_Z_MAKE((rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0);
	      // dcomplex cfun = (vh[nbhj + lt20 - 1] * vyhj[nby + ny - 1]) * f3j;
	      magmaDoubleComplex cfun = magma_zprod(vh[nbhj + lt20 - 1], vyhj[nby + ny - 1]);
	      magmaDoubleComplex fcfun = magma_zprod(cfun, f3j);
	      csum = magma_zadd(csum, cfun);
	    }
	    // label 20
	    jsn *= -1;
	    lt20 += 2;
	  }
	}
	// label 22
	//csum *= cr;
	csum = magma_zscale(csum, cr);
	result = magma_zadd(result, csum);
      }
      // Otherwise there is nothing to add
    } // jm24 loop. Should go to 70
    magmaDoubleComplex cr = (ipamo == 1) ?
      MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) :
      MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0);
    return magma_zprod(result, cr);
  } else if constexpr (IHI == 1) {
    int litpo = li + li + 1;
    int litpos = litpo * litpo;
    int nbhj = nblmo * litpo;
    int nby = nblmo * litpos;
    magmaDoubleComplex result = cz0;
    for (int jm44 = 1; jm44 <= 3; jm44++) {
      magmaDoubleComplex csum = cz0;
      int mu = jm44 - 2;
      int mupm1 = mu + m1;
      int mupm2 = mu + m2;
      if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) {
	int jsn = - isn;
	if (mu == 0) jsn = isn;
	double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2);
	int i3j0 = i3j0in;
	if (mupm1 == 0 && mupm2 == 0) {
	  int lt34 = lminpo;
	  while (lt34 <= lmaxpo) {
	    i3j0++;
	    int l3 = lt34 - 1;
	    int ny = l3 * l3 + lt34;
	    double aors = 1.0 * (l3 + lt34);
	    magmaDoubleComplex f3j = MAGMA_Z_MAKE((v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0);
	    magmaDoubleComplex cfun = magma_zprod(vj, vyhj[nby + ny - 1]);
	    magmaDoubleComplex fcfun = magma_zprod(cfun, f3j);
	    csum = magma_zadd(csum, fcfun);
	    jsn *= -1;
	    lt34 += 2;
	  }
	  // goes to 42
	} else { // label 36
	  magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j);
	  int il = ilin;
	  int lt40 = lminpo;
	  while (lt40 <= lmaxpo) {
	    i3j0++;
	    if (m1mm2m <= lt40) {
	      il += 2;
	      int l3 = lt40 - 1;
	      int ny = l3 * l3  + lt40 + m1mm2;
	      double aors = 1.0 * (l3 + lt40);
	      magmaDoubleComplex f3j = MAGMA_Z_MAKE((rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors)) * jsn, 0.0);
	      magmaDoubleComplex cfun = magma_zprod(vj, vyhj[nby + ny - 1]);
	      magmaDoubleComplex fcfun = magma_zprod(cfun, f3j);
	      csum = magma_zadd(csum, cfun);
	    }
	    // label 40
	    jsn *= -1;
	    lt40 += 2;
	  }
	}
	// label 42
	csum = magma_zscale(csum, cr);
	result = magma_zadd(result, csum);
      }
      // Otherwise there is nothing to add
    } // jm44 loop. Should go to 70
    magmaDoubleComplex cr = (ipamo == 1) ?
      MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) :
      MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0);
    return magma_zprod(result, cr);
  } else { // The project grants that IHI = 2 whenever it is neither 0 nor 1.
    int lmtpo = lm + lm + 1;
    int lmtpos = lmtpo * lmtpo;
    int nbhj = nblmo * lmtpo;
    int nby = nblmo * lmtpos;
    magmaDoubleComplex result = cz0;
    if (rxx[nbl - 1] == 0.0 && ryy[nbl - 1] == 0.0 && rzz[nbl - 1] == 0.0) {
      if (ipamo == 0) {
	if (l1 == l2 && m1 == m2) return MAGMA_Z_MAKE(1.0, 0.0);
      }
      return MAGMA_Z_MAKE(0.0, 0.0);
    }
    for (int jm64 = 1; jm64 <= 3; jm64++) {
      magmaDoubleComplex csum = cz0;
      int mu = jm64 - 2;
      int mupm1 = mu + m1;
      int mupm2 = mu + m2;
      if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) {
	int jsn = -isn;
	if (mu == 0) jsn = isn;
	double cr = magma_cgev(ipamo, mu, l1, m1) * magma_cgev(0, mu, l2, m2);
	int i3j0 = i3j0in;
	if (mupm1 == 0 && mupm2 == 0) {
	  int lt54 = lminpo;
	  while (lt54 <= lmaxpo) {
	    i3j0++;
	    int l3 = lt54 - 1;
	    int ny = l3 * l3 + lt54;
	    double aors = 1.0 * (l3 + lt54);
	    magmaDoubleComplex f3j = MAGMA_Z_MAKE(v3j0[i3j0 - 1] * v3j0[i3j0 - 1] * sqrt(aors) * jsn, 0.0);
	    magmaDoubleComplex cfun = magma_zprod(vj0[nbhj + lt54 - 1], vyj0[nby + ny - 1]);
	    magmaDoubleComplex fcfun = magma_zprod(cfun, f3j);
	    csum = magma_zadd(csum, fcfun);
	    jsn *= -1;
	    lt54 += 2;
	  }
	  // goes to 62
	} else { // label 56
	  magma_r3jjr(l1mp, l2, -mupm1, mupm2, rac3j);
	  int il = ilin;
	  int lt60 = lminpo;
	  while (lt60 <= lmaxpo) {
	    i3j0++;
	    if (m1mm2m <= lt60) {
	      il += 2;
	      int l3 = lt60 - 1;
	      int ny = l3 * l3  + lt60 + m1mm2;
	      double aors = 1.0 * (l3 + lt60);
	      magmaDoubleComplex f3j = MAGMA_Z_MAKE(rac3j[il - 1] * v3j0[i3j0 - 1] * sqrt(aors) * jsn, 0.0);
	      magmaDoubleComplex cfun = magma_zprod(vj0[nbhj + lt60 - 1], vyj0[nby + ny - 1]);
	      magmaDoubleComplex fcfun = magma_zprod(cfun, f3j);
	      csum = magma_zadd(csum, cfun);
	    }
	    // label 60
	    jsn *= -1;
	    lt60 += 2;
	  }
	}
	// label 62
	csum = magma_zscale(csum, cr);
	result = magma_zadd(result, csum);
      }
      // Otherwise there is nothing to add
    } // jm64 loop. Should go to 70
    magmaDoubleComplex cr = (ipamo == 1) ?
      MAGMA_Z_MAKE(0.0, sqrt(FOUR_PI * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po)) :
      MAGMA_Z_MAKE(sqrt(FOUR_PI * (l1 + l1po) * (l2 + l2 + 1)), 0.0);
    return magma_zprod(result, cr);
  }
}
#pragma omp end declare target


#pragma omp begin declare target device_type(any)
/**
 * \brief Compute the 3j symbol for Clebsch-Gordan coefficients for JJ transitions.
 *
 * This function calculates the 3j(J,J2,J3;-M2-M3,M2,M3) symbol for the Clebsch-Gordan
 * coefficients. See Appendix a.3.1 in Borghese, Denti & Saija (2007). This function
 * is a specialized version to work on GPUs with MAGMA.
 *
 * \param j2: `int` Value of J2. [IN]
 * \param j3: `int` Value of J3. [IN]
 * \param m2: `int` Value of M2. [IN]
 * \param m3: `int` Value of M3. [IN]
 * \param rac3j: `double[]` Vector of 3j symbols. [OUT]
 */
void magma_r3jjr(int j2, int j3, int m2, int m3, double rac3j[]) {
  int jmx = j3 + j2;
  int jdf = j3 - j2;
  int m1 = -m2 - m3;
  int abs_jdf = (jdf >= 0) ? jdf : -jdf;
  int abs_m1 = (m1 >= 0) ? m1 : -m1;
  int jmn = (abs_jdf > abs_m1) ? abs_jdf : abs_m1;
  int njmo = jmx - jmn;
  int jf = jmx + jmx + 1;
  int isn = ((jdf + m1) % 2 == 0) ? 1 : -1;
  if (njmo <= 0) {
    double sj = 1.0 * jf;
    double cnr = (1.0 / sqrt(sj)) * isn;
    rac3j[0] = cnr;
  } else { // label 15
    double sjt = 1.0;
    double sjr = 1.0 * jf;
    int jsmpos = (jmx + 1) * (jmx + 1);
    int jdfs = jdf * jdf;
    int m1s = m1 * m1;
    int mdf = m3 - m2;
    int idjc = m1 * (j3 * (j3 + 1) - j2 * (j2 +1));
    int j1 = jmx;
    int j1s = j1 * j1;
    int j1po = j1 + 1;
    double ccj = 1.0 * (j1s - jdfs) * (j1s - m1s);
    double cj = sqrt(ccj * (jsmpos - j1s));
    double dj = 1.0 * jf * (j1 * j1po * mdf + idjc);
    if (njmo <= 1) {
      rac3j[0] = -dj / (cj * j1po);
      double sj = sjr + (rac3j[0] * rac3j[0]) * (jf - 2);
      double cnr = (1.0 / sqrt(sj)) * isn;
      rac3j[1] = cnr;
      rac3j[0] *= cnr;
    } else { // label 20
      double cjp = 0.0;
      int nj = njmo + 1;
      int nmat = (nj + 1) / 2;
      rac3j[nj - 1] = 1.0;
      rac3j[njmo - 1] = -dj / (cj * j1po);
      if (nmat != njmo) {
	int nbr = njmo - nmat;
	for (int ibr45 = 1; ibr45 <= nbr; ibr45++) {
	  int irr = nj - ibr45;
	  jf -= 2;
	  j1--;
	  j1s = j1 * j1;
	  j1po = j1 + 1;
	  cjp = cj;
	  ccj = 1.0 * (j1s - jdfs) * (j1s - m1s);
	  cj = sqrt(ccj * (jsmpos - j1s));
	  sjt = rac3j[irr - 1] * rac3j[irr - 1];
	  dj = 1.0 * jf * (j1 * j1po * mdf + idjc);
	  rac3j[irr - 2] = -(rac3j[irr - 1] * dj + rac3j[irr] * cjp * j1) / (cj * j1po);
	  sjr += (sjt * jf);
	} // ibr45 loop
      }
      // label 50
      double osjt = sjt;
      sjt = rac3j[nmat - 1] * rac3j[nmat - 1];
      if (sjt >= osjt) {
	sjr += (sjt * (jf - 2));
      } else { // label 55
	nmat++;
      }
      // label 60
      double racmat = rac3j[nmat - 1];
      rac3j[0] = 1.0;
      jf = jmn + jmn + 1;
      double sjl = 1.0 * jf;
      j1 = jmn;
      if (j1 != 0) {
	j1po = j1 + 1;
	int j1pos = j1po * j1po;
	double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s);
	cjp = sqrt(ccjp * (jsmpos - j1pos));
	dj = 1.0 * jf * (j1 * j1po * mdf + idjc);
	rac3j[1] = - dj / (cjp * j1);
      } else { // label 62
	cjp = sqrt(1.0 * (jsmpos - 1));
	dj = 1.0 * mdf;
	rac3j[1] = -dj / cjp;
      }
      // label 63
      int nmatmo = nmat - 1;
      if (nmatmo >= 2) {
	for (int irl70 = 2; irl70 <= nmatmo; irl70++) {
	  jf += 2;
	  j1++;
	  j1po = j1 + 1;
	  int j1pos = j1po * j1po;
	  cj = cjp;
	  double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s);
	  cjp = sqrt(ccjp * (jsmpos - j1pos));
	  sjt = rac3j[irl70 - 1] * rac3j[irl70 - 1];
	  dj = 1.0 * jf * (j1 * j1po * mdf + idjc);
	  rac3j[irl70] = -(
	    rac3j[irl70 - 1] * dj + rac3j[irl70 - 2] * cj * j1po
	  ) / (cjp * j1);
	  sjl += (sjt * jf);
	}
      }
      // label 75
      double ratrac = racmat / rac3j[nmat - 1];
      double rats = ratrac * ratrac;
      double sj = sjr + sjl * rats;
      rac3j[nmat - 1] = racmat;
      double cnr = (1.0 / sqrt(sj)) * isn;
      for (int irr80 = nmat; irr80 <= nj; irr80++) rac3j[irr80 - 1] *= cnr;
      double cnl = cnr * ratrac;
      for (int irl85 = 1; irl85 <= nmatmo; irl85++) rac3j[irl85 - 1] *= cnl;
    }
  }
}
#pragma omp end declare target

#endif // USE_TARGET_OFFLOAD

void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const RuntimeSettings& rs) {
  magmaDoubleComplex *a = (magmaDoubleComplex *)mat[0]; // pointer to first element on host
  string message;