Loading src/trapping/cfrfme.cpp +22 −21 Original line number Diff line number Diff line Loading @@ -628,24 +628,24 @@ void offload_loop( double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt, double delks, double frsh ) { int nvtot = nxv * nyv * nzv; int nkvs = nkv * nkv; int nkvmo = nkv - 1; int nkvvmo = nkvmo * nkv; int nvxy = nxv * nyv; long long nvtot = nxv * nyv * nzv; long long nkvs = nkv * nkv; long long nkvmo = nkv - 1; long long nkvvmo = nkvmo * nkv; long long nvxy = nxv * nyv; const dcomplex uim = 0.0 + I * 1.0; dcomplex cc0 = 0.0 + I * 0.0; // Inizializza global_vec_w e vec_wsum sulla CPU in parallelo #pragma omp parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { for (long long jxy50 = 0; jxy50 < nkvs; jxy50++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; int wk_index = nlmmt * jxy50; long long wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80_index]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; long long jy50 = jxy50 / nkv; long long jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop } Loading @@ -660,22 +660,22 @@ void offload_loop( map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) { // Esegue il calcolo principale in un unico kernel GPU // con parallelizzazione sui cicli più grandi (nvtot e nkvs) // Run the main calculation in a single GPU kernel // with parallelization on the largest loops (nvtot & nkvs) #pragma omp target teams distribute parallel for collapse(2) for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { for (long long ixyz = 0; ixyz < nvtot; ixyz++) { for (long long jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; long long iz75 = ixyz / nvxy; long long iy70 = (ixyz % nvxy) / nxv; long long ix65 = ixyz % nxv; double z = _zv[iz75] + frsh; double y = _yv[iy70]; double x = _xv[ix65]; int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; long long jy60 = jy60x55 / nkv; long long jx55 = jy60x55 % nkv; long long w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double factor = (jy60 == 0 || jy60 == nkvmo) ? 0.5 : 1.0; double vkx, vkz; Loading Loading @@ -709,11 +709,12 @@ void offload_loop( term = phas * factor; } // L'ultima loop è ora seriale, garantendo la correttezza dei risultati // The last loop can now be serialized, granting for correctness // of the results for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; long wsum_index = (j80_index * nvtot) + ixyz; long long wsum_index = (j80_index * nvtot) + ixyz; vec_wsum[wsum_index] += delks * vec_w[w_index] * term; } } // jy60x55 loop Loading Loading
src/trapping/cfrfme.cpp +22 −21 Original line number Diff line number Diff line Loading @@ -628,24 +628,24 @@ void offload_loop( double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt, double delks, double frsh ) { int nvtot = nxv * nyv * nzv; int nkvs = nkv * nkv; int nkvmo = nkv - 1; int nkvvmo = nkvmo * nkv; int nvxy = nxv * nyv; long long nvtot = nxv * nyv * nzv; long long nkvs = nkv * nkv; long long nkvmo = nkv - 1; long long nkvvmo = nkvmo * nkv; long long nvxy = nxv * nyv; const dcomplex uim = 0.0 + I * 1.0; dcomplex cc0 = 0.0 + I * 0.0; // Inizializza global_vec_w e vec_wsum sulla CPU in parallelo #pragma omp parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { for (long long jxy50 = 0; jxy50 < nkvs; jxy50++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; int wk_index = nlmmt * jxy50; long long wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80_index]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; long long jy50 = jxy50 / nkv; long long jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop } Loading @@ -660,22 +660,22 @@ void offload_loop( map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) { // Esegue il calcolo principale in un unico kernel GPU // con parallelizzazione sui cicli più grandi (nvtot e nkvs) // Run the main calculation in a single GPU kernel // with parallelization on the largest loops (nvtot & nkvs) #pragma omp target teams distribute parallel for collapse(2) for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { for (long long ixyz = 0; ixyz < nvtot; ixyz++) { for (long long jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; long long iz75 = ixyz / nvxy; long long iy70 = (ixyz % nvxy) / nxv; long long ix65 = ixyz % nxv; double z = _zv[iz75] + frsh; double y = _yv[iy70]; double x = _xv[ix65]; int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; long long jy60 = jy60x55 / nkv; long long jx55 = jy60x55 % nkv; long long w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double factor = (jy60 == 0 || jy60 == nkvmo) ? 0.5 : 1.0; double vkx, vkz; Loading Loading @@ -709,11 +709,12 @@ void offload_loop( term = phas * factor; } // L'ultima loop è ora seriale, garantendo la correttezza dei risultati // The last loop can now be serialized, granting for correctness // of the results for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; long wsum_index = (j80_index * nvtot) + ixyz; long long wsum_index = (j80_index * nvtot) + ixyz; vec_wsum[wsum_index] += delks * vec_w[w_index] * term; } } // jy60x55 loop Loading