Loading src/trapping/cfrfme.cpp +78 −6 Original line number Original line Diff line number Diff line Loading @@ -65,6 +65,15 @@ #include <omp.h> #include <omp.h> #endif #endif #ifdef USE_TARGET_OFFLOAD void offload_loop( dcomplex *vec_wsum, int size_vec_wsum, dcomplex *global_vec_w, int size_global_vec_w, const dcomplex *vec_tt1_wk, int size_vec_tt1_wk, double *vkv, double *_xv, int nxv, double *_yv, int nyv, double *_zv, int nzv, double *vec_vkzm, int jlmf, int jlml, int nkv, int nlmmt, double delks, double frsh ); #endif using namespace std; using namespace std; /*! \brief C++ implementation of FRFME /*! \brief C++ implementation of FRFME Loading Loading @@ -404,14 +413,13 @@ void frfme(string data_file, string output_path) { double *vec_vkzm = vkzm[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for \ offload_loop( map(tofrom: vec_wsum[0:size_vec_wsum]) \ vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, map(alloc: global_vec_w[0:size_global_vec_w]) \ size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ nkv, nlmmt, delks, frsh map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) ); #else #else #pragma omp parallel for #pragma omp parallel for #endif for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int j80 = jlmf - 1; j80 < jlml; j80++) { dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); #pragma omp parallel for simd #pragma omp parallel for simd Loading Loading @@ -455,6 +463,7 @@ void frfme(string data_file, string output_path) { vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; } // ixyz loop } // ixyz loop } // j80 loop } // j80 loop #endif // USE_TARGET_OFFLOAD delete[] global_vec_w; delete[] global_vec_w; #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); Loading Loading @@ -512,3 +521,66 @@ void frfme(string data_file, string output_path) { #endif #endif } } #ifdef USE_TARGET_OFFLOAD void offload_loop( dcomplex *vec_wsum, int size_vec_wsum, dcomplex *global_vec_w, int size_global_vec_w, const dcomplex *vec_tt1_wk, int size_vec_tt1_wk, double *vkv, double *_xv, int nxv, double *_yv, int nyv, double *_zv, int nzv, double *vec_vkzm, int jlmf, int jlml, int nkv, int nlmmt, double delks, double frsh ) { int nrvc = nxv * nyv * nzv; int nkvs = nkv * nkv; const dcomplex cc0 = 0.0 + I * 0.0; const dcomplex uim = 0.0 + I * 1.0; #pragma omp target teams distribute parallel for \ map(tofrom: vec_wsum[0:size_vec_wsum]) \ map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) for (int j80 = jlmf - 1; j80 < jlml; j80++) { dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); #pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop #pragma omp parallel for simd for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; double z = _zv[iz75] + frsh; double y = _yv[iy70]; double x = _xv[ix65]; dcomplex sumy = cc0; #pragma omp parallel for simd reduction(+:sumy) for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double vkx = (jx55 == 0) ? vkv[nkv - 1] : vkv[jx55]; double vkz = vec_vkzm[(jx55 * nkv) + jy60]; double sign = (jx55 == 0) ? -1.0 : 1.0; double rpart = cos(sign * vkx * x + vky * y + vkz * z); double ipart = sin(vkx * x + vky * y + vkz * z); dcomplex phas = rpart + I * ipart; // dcomplex phas = cexp(uim * (sign * vkx * x + vky * y + vkz * z)); dcomplex sumx = vec_w[w_index] * phas; double factor1 = ((jx55 == 0) || (jx55 == (nkv - 1))) ? 0.5 : 1.0; double factor2 = ((jy60 == 0) || (jy60 == (nkv - 1))) ? 0.5 : 1.0; sumx *= factor1*factor2; sumy += sumx; } // jy60x55 loop vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; } // ixyz loop } // j80 loop } #endif // USE TARGET_OFFLOAD Loading
src/trapping/cfrfme.cpp +78 −6 Original line number Original line Diff line number Diff line Loading @@ -65,6 +65,15 @@ #include <omp.h> #include <omp.h> #endif #endif #ifdef USE_TARGET_OFFLOAD void offload_loop( dcomplex *vec_wsum, int size_vec_wsum, dcomplex *global_vec_w, int size_global_vec_w, const dcomplex *vec_tt1_wk, int size_vec_tt1_wk, double *vkv, double *_xv, int nxv, double *_yv, int nyv, double *_zv, int nzv, double *vec_vkzm, int jlmf, int jlml, int nkv, int nlmmt, double delks, double frsh ); #endif using namespace std; using namespace std; /*! \brief C++ implementation of FRFME /*! \brief C++ implementation of FRFME Loading Loading @@ -404,14 +413,13 @@ void frfme(string data_file, string output_path) { double *vec_vkzm = vkzm[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for \ offload_loop( map(tofrom: vec_wsum[0:size_vec_wsum]) \ vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, map(alloc: global_vec_w[0:size_global_vec_w]) \ size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ nkv, nlmmt, delks, frsh map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) ); #else #else #pragma omp parallel for #pragma omp parallel for #endif for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int j80 = jlmf - 1; j80 < jlml; j80++) { dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); #pragma omp parallel for simd #pragma omp parallel for simd Loading Loading @@ -455,6 +463,7 @@ void frfme(string data_file, string output_path) { vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; } // ixyz loop } // ixyz loop } // j80 loop } // j80 loop #endif // USE_TARGET_OFFLOAD delete[] global_vec_w; delete[] global_vec_w; #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); Loading Loading @@ -512,3 +521,66 @@ void frfme(string data_file, string output_path) { #endif #endif } } #ifdef USE_TARGET_OFFLOAD void offload_loop( dcomplex *vec_wsum, int size_vec_wsum, dcomplex *global_vec_w, int size_global_vec_w, const dcomplex *vec_tt1_wk, int size_vec_tt1_wk, double *vkv, double *_xv, int nxv, double *_yv, int nyv, double *_zv, int nzv, double *vec_vkzm, int jlmf, int jlml, int nkv, int nlmmt, double delks, double frsh ) { int nrvc = nxv * nyv * nzv; int nkvs = nkv * nkv; const dcomplex cc0 = 0.0 + I * 0.0; const dcomplex uim = 0.0 + I * 1.0; #pragma omp target teams distribute parallel for \ map(tofrom: vec_wsum[0:size_vec_wsum]) \ map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) for (int j80 = jlmf - 1; j80 < jlml; j80++) { dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); #pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop #pragma omp parallel for simd for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; double z = _zv[iz75] + frsh; double y = _yv[iy70]; double x = _xv[ix65]; dcomplex sumy = cc0; #pragma omp parallel for simd reduction(+:sumy) for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double vkx = (jx55 == 0) ? vkv[nkv - 1] : vkv[jx55]; double vkz = vec_vkzm[(jx55 * nkv) + jy60]; double sign = (jx55 == 0) ? -1.0 : 1.0; double rpart = cos(sign * vkx * x + vky * y + vkz * z); double ipart = sin(vkx * x + vky * y + vkz * z); dcomplex phas = rpart + I * ipart; // dcomplex phas = cexp(uim * (sign * vkx * x + vky * y + vkz * z)); dcomplex sumx = vec_w[w_index] * phas; double factor1 = ((jx55 == 0) || (jx55 == (nkv - 1))) ? 0.5 : 1.0; double factor2 = ((jy60 == 0) || (jy60 == (nkv - 1))) ? 0.5 : 1.0; sumx *= factor1*factor2; sumy += sumx; } // jy60x55 loop vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; } // ixyz loop } // j80 loop } #endif // USE TARGET_OFFLOAD