Commit a3f8befc authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Use lon long indices on large loops and vector access

parent 3e0025ff
Loading
Loading
Loading
Loading
+22 −21
Original line number Diff line number Diff line
@@ -628,24 +628,24 @@ void offload_loop(
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt, double delks, double frsh
) {
  int nvtot = nxv * nyv * nzv;
  int nkvs = nkv * nkv;
  int nkvmo = nkv - 1; 
  int nkvvmo = nkvmo * nkv;
  int nvxy = nxv * nyv;
  long long nvtot = nxv * nyv * nzv;
  long long nkvs = nkv * nkv;
  long long nkvmo = nkv - 1; 
  long long nkvvmo = nkvmo * nkv;
  long long nvxy = nxv * nyv;
  const dcomplex uim = 0.0 + I * 1.0;
  dcomplex cc0 = 0.0 + I * 0.0;

 // Inizializza global_vec_w e vec_wsum sulla CPU in parallelo
#pragma omp parallel for collapse(2)
  for (int j80 = jlmf - 1; j80 < jlml; j80++) {
    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
    for (long long jxy50 = 0; jxy50 < nkvs; jxy50++) {
      int j80_index = j80 - jlmf + 1;
      dcomplex *vec_w = global_vec_w + nkvs * j80_index;
      int wk_index = nlmmt * jxy50;
      long long wk_index = nlmmt * jxy50;
      dcomplex wk_value = vec_tt1_wk[wk_index + j80_index];
      int jy50 = jxy50 / nkv;
      int jx50 = jxy50 % nkv;
      long long jy50 = jxy50 / nkv;
      long long jx50 = jxy50 % nkv;
      vec_w[(nkv * jx50) + jy50] = wk_value;
    } // jxy50 loop
  }
@@ -660,22 +660,22 @@ void offload_loop(
  map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(to: vkv[0:nkv], vec_vkzm[0:nkvs])
  {
    // Esegue il calcolo principale in un unico kernel GPU
    // con parallelizzazione sui cicli più grandi (nvtot e nkvs)
    // Run the main calculation in a single GPU kernel
    // with parallelization on the largest loops (nvtot & nkvs)
#pragma omp target teams distribute parallel for collapse(2)
    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
    for (long long ixyz = 0; ixyz < nvtot; ixyz++) {
      for (long long jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {

        int iz75 = ixyz / nvxy;
        int iy70 = (ixyz % nvxy) / nxv;
        int ix65 = ixyz % nxv;
        long long iz75 = ixyz / nvxy;
        long long iy70 = (ixyz % nvxy) / nxv;
        long long ix65 = ixyz % nxv;
        double z = _zv[iz75] + frsh;
        double y = _yv[iy70];
        double x = _xv[ix65];

        int jy60 = jy60x55 / nkv;
        int jx55 = jy60x55 % nkv;
        int w_index = (jx55 * nkv) + jy60;
        long long jy60 = jy60x55 / nkv;
        long long jx55 = jy60x55 % nkv;
        long long w_index = (jx55 * nkv) + jy60;
        double vky = vkv[jy60];
        double factor = (jy60 == 0 || jy60 == nkvmo) ? 0.5 : 1.0;
        double vkx, vkz;
@@ -709,11 +709,12 @@ void offload_loop(
          term = phas * factor;
        }
        
        // L'ultima loop è ora seriale, garantendo la correttezza dei risultati
	// The last loop can now be serialized, granting for correctness
	// of the results
        for (int j80 = jlmf - 1; j80 < jlml; j80++) {
          int j80_index = j80 - jlmf + 1;
          dcomplex *vec_w = global_vec_w + nkvs * j80_index;
          long wsum_index = (j80_index * nvtot) + ixyz;
          long long wsum_index = (j80_index * nvtot) + ixyz;
          vec_wsum[wsum_index] += delks * vec_w[w_index] * term;
        }
      } // jy60x55 loop