Commit e8b6a0e5 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Implement OpenMP CPU parallelized LUCIN()

parent 4faff7e6
Loading
Loading
Loading
Loading
+25 −17
Original line number Diff line number Diff line
@@ -388,6 +388,9 @@ void apcra(
  delete[] svs;
}

#ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
#endif
dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int istep) {
  dcomplex result = z;
  if (nj > 0) {
@@ -398,6 +401,9 @@ dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int
  }
  return result;
}
#ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
#endif

// #ifdef USE_TARGET_OFFLOAD
// #pragma omp begin declare target device_type(any)
@@ -1145,7 +1151,7 @@ void hjv(
  delete[] rfn;
}

void lucin(dcomplex **am, const np_int nddmst, np_int n, int &ier) {
int lucin(dcomplex *vec_am, const np_int nddmst, np_int n) {
  /* NDDMST  FIRST DIMENSION OF AM AS DECLARED IN DIMENSION
   *         STATEMENT.
   * N       NUMBER OF ROWS IN AM.
@@ -1153,22 +1159,24 @@ void lucin(dcomplex **am, const np_int nddmst, np_int n, int &ier) {
   */
  double *v = new double[nddmst];
  double *vi = new double[nddmst]();
  dcomplex *vec_am = am[0];
  const dcomplex cc0 = 0.0 + 0.0 * I;
  ier = 0;
  int ier = 0;
  int nminus = n - 1;
#pragma omp parallel for
  for (int64_t i = 1; i <= n; i++) {
#pragma omp parallel for reduction(+: vi[i - 1])
    for (int64_t j = 1; j <= n; j++) {
      vi[i - 1] += (
	real(vec_am[(i - 1) * n + j - 1]) * real(vec_am[(i - 1) * n + j - 1])
	+ imag(vec_am[(i - 1) * n + j - 1]) * imag(vec_am[(i - 1) * n + j - 1])
  const np_int nn = n * n;
#pragma omp parallel
  {
#pragma omp for reduction(+: vi[0:nddmst])
    for (np_int ij = 0; ij < nn; ij++) {
      np_int i = ij / n;
      np_int j = ij % n;
      vi[i] += (
        real(vec_am[i * n + j]) * real(vec_am[i * n + j])
        + imag(vec_am[i * n + j]) * imag(vec_am[i * n + j])
      );
    } // j1319 loop
    v[i - 1] = 1.0 / vi[i - 1];
  } // i1309 loop
  delete[] vi;
    }
#pragma omp for
    for (np_int i = 0; i < n; i++) v[i] = 1.0 / vi[i];
  } // OMP parallel region
  // 2. REPLACE AM BY TRIANGULAR MATRICES (L,U) WHERE AM=L*U.
  //    REPLACE L(I,I) BY 1/L(I,I), READY FOR SECTION 4.
  //    (ROW INTERCHANGES TAKE PLACE, AND THE INDICES OF THE PIVOTAL ROWS
@@ -1202,13 +1210,11 @@ void lucin(dcomplex **am, const np_int nddmst, np_int n, int &ier) {
    v[k - 1] = 1.0 * l;
    if (psqmax == 0.0) {
      ier = 1;
      delete[] v;
      return;
      // return ier;
    }
    dcomplex ctemp = 1.0 / vec_am[(k - 1) * n + k - 1];
    vec_am[(k - 1) * n + k - 1] = ctemp;
    if (kplus <= n) {
#pragma omp parallel for
      for (int64_t j = kplus; j <= n; j++) {
	dcomplex cfun = cdtp(-vec_am[(k - 1) * n + j - 1], vec_am, k, 1, j, kminus, n);
	vec_am[(k - 1) * n + j - 1] = -ctemp * cfun;
@@ -1253,7 +1259,9 @@ void lucin(dcomplex **am, const np_int nddmst, np_int n, int &ier) {
      } // i4319 loop
    }
  } // l4309 loop
  delete[] vi;
  delete[] v;
  return ier;
}

void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **cext) {