Commit a4af2237 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Flatten j80 loop and add logging on number of J iterations

parent 43d18f76
Loading
Loading
Loading
Loading
+25 −26
Original line number Diff line number Diff line
@@ -467,6 +467,8 @@ void frfme(string data_file, string output_path) {
	  dcomplex *vec_wsum = tfrfme->wsum[0];
	  double *vec_vkzm = vkzm[0];
	  dcomplex *global_vec_w = new dcomplex[size_global_vec_w];
	  message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n";
	  logger.log(message);
#ifdef USE_TARGET_OFFLOAD
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_start - t_end;
@@ -664,10 +666,16 @@ void offload_loop(
      } // jxy50 loop
    }

    // Kernel 2: run the calculation
#pragma omp target teams distribute parallel for collapse(2)
    for (int j80 = jlmf - 1; j80 < jlml; j80++) {
    // Kernel 2: ensure that vec_wsum is initialized at 0
#pragma omp target teams distribute parallel for
    for (long i = 0; i < size_vec_wsum; i++)
      vec_wsum[i] = cc0;

    // Kernel 3: run the calculation
#pragma omp target teams distribute parallel for collapse(3)
    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
	for (int j80 = jlmf - 1; j80 < jlml; j80++) {
	  int j80_index = j80 - jlmf + 1;
	  dcomplex *vec_w = global_vec_w + nkvs * j80_index;
	  int iz75 = ixyz / nvxy;
@@ -676,15 +684,12 @@ void offload_loop(
	  double z = _zv[iz75] + frsh;
	  double y = _yv[iy70];
	  double x = _xv[ix65];
	double rsumy = 0.0;
	double isumy = 0.0;
#pragma omp parallel for reduction(+:rsumy, isumy)
	for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
	  int jy60 = jy60x55 / nkv;
	  int jx55 = jy60x55 % nkv;
	  int w_index = (jx55 * nkv) + jy60;
	  double vky = vkv[jy60];
	  double factor = (jy60 == 0 || jy60 == nkvmo) ? 0.5 : 1.0;
	  long wsum_index = (j80_index * nvtot) + ixyz;
	  if (jx55 == 0) {
	    // jx55 = 0: phasf
	    double vkx = vkv[nkvmo];
@@ -695,8 +700,7 @@ void offload_loop(
	    dcomplex phasf = c + uim * s;
	    dcomplex term = vec_w[jy60] * phasf * 0.5;
	    term *= factor;
	    rsumy += (real(term));
	    isumy += (imag(term));
	    vec_wsum[wsum_index] += term * delks;
	  } else if (jx55 == nkvmo) {
	    // jx55 = nkv - 1: phasl
	    double vkx = vkv[nkvmo];
@@ -707,8 +711,7 @@ void offload_loop(
	    dcomplex phasl = c + uim * s;
	    dcomplex term = vec_w[nkvvmo + jy60] * phasl * 0.5;
	    term *= factor;
	    rsumy += (real(term));
	    isumy += (imag(term));
	    vec_wsum[wsum_index] += term * delks;
	  } else {
	    // 1 <= jx55 < nkv - 1
	    double vkx = vkv[jx55];
@@ -719,15 +722,11 @@ void offload_loop(
	    dcomplex phas = c + uim * s;
	    dcomplex term = vec_w[w_index] * phas;
	    term *= factor;
	    rsumy += (real(term));
	    isumy += (imag(term));
	    vec_wsum[wsum_index] += term * delks;
	  }
	} // jy60x55 loop
	dcomplex sumy = rsumy + uim * isumy;
	vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks;
      } // ixyz loop
    } // j80 loop
    
  } // target region
}
#endif // USE TARGET_OFFLOAD