Loading src/trapping/cfrfme.cpp +28 −60 Original line number Diff line number Diff line Loading @@ -65,10 +65,6 @@ #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; /*! \brief C++ implementation of FRFME Loading Loading @@ -399,95 +395,67 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX nvtxRangePush("j80 loop"); #endif int nkvs = nkv * nkv; int size_vec_wsum = nlmmt * nrvc; int size_global_vec_w = nkvs * (jlml - jlmf + 1); int size_vec_tt1_wk = nkvs * nlmmt; const dcomplex *vec_tt1_wk = tt1->wk; dcomplex *vec_wsum = tfrfme->wsum[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for #pragma omp target teams distribute parallel for \ map(tofrom: vec_wsum[0:size_vec_wsum]) \ map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) #else #pragma omp parallel for #endif for (int j80 = jlmf - 1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); // dcomplex *wk_local = new dcomplex[nlmmt](); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; int wk_index = 0; // for (int jy50 = 0; jy50 < nkv; jy50++) { // for (int jx50 = 0; jx50 < nkv; jx50++) { // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #else // #pragma omp parallel for simd // #endif dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); #pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; // w[jx50][jy50] = wk_local[j80]; wk_index = nlmmt*jxy50; wk_value = tt1->wk[wk_index + j80]; // wk_index += nlmmt; int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; // w[jx50][jy50] = wk_value; } // jxy50 loop // } // jx50 // } // jy50 loop int ixyz = 0; #pragma omp parallel for simd for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for // #else // #pragma omp parallel for // #endif #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; // for (int iz75 = 0; iz75 < nzv; iz75++) { double z = _zv[iz75] + frsh; // for (int iy70 = 0; iy70 < nyv; iy70++) { double y = _yv[iy70]; // for (int ix65 = 0; ix65 < nxv; ix65++) { double x = _xv[ix65]; // ixyz++; dcomplex sumy = cc0; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sumy) // #else // #pragma omp parallel for simd reduction(+:sumy) // #endif #pragma omp parallel for simd reduction(+:sumy) for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double vkx = (jx55 == 0) ? vkv[nkv - 1] : vkv[jx55]; double vkz = vec_vkzm[(jx55 * nkv) + jy60]; dcomplex phas = (jx55 == 0) ? cexp(uim * (-vkx * x + vky * y + vkz * z)): cexp(uim * (vkx * x + vky * y + vkz * z)); dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; dcomplex sumx = vec_w[w_index] * phas; double factor1 = ((jx55 == 0) || (jx55 == (nkv - 1))) ? 0.5 : 1.0; double factor2 = ((jy60 == 0) || (jy60 == (nkv - 1))) ? 0.5 : 1.0; sumx *= factor1*factor2; sumy += sumx; } // jy60x55 loop vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; // } // ix65 loop // } // iy70 loop // } // iz75 loop vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; } // ixyz loop free(vec_w); free(w); // delete[] wk_local; } // j80 loop delete[] global_vec_w; #ifdef USE_NVTX nvtxRangePop(); #endif Loading Loading
src/trapping/cfrfme.cpp +28 −60 Original line number Diff line number Diff line Loading @@ -65,10 +65,6 @@ #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; /*! \brief C++ implementation of FRFME Loading Loading @@ -399,95 +395,67 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX nvtxRangePush("j80 loop"); #endif int nkvs = nkv * nkv; int size_vec_wsum = nlmmt * nrvc; int size_global_vec_w = nkvs * (jlml - jlmf + 1); int size_vec_tt1_wk = nkvs * nlmmt; const dcomplex *vec_tt1_wk = tt1->wk; dcomplex *vec_wsum = tfrfme->wsum[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for #pragma omp target teams distribute parallel for \ map(tofrom: vec_wsum[0:size_vec_wsum]) \ map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) #else #pragma omp parallel for #endif for (int j80 = jlmf - 1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); // dcomplex *wk_local = new dcomplex[nlmmt](); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; int wk_index = 0; // for (int jy50 = 0; jy50 < nkv; jy50++) { // for (int jx50 = 0; jx50 < nkv; jx50++) { // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #else // #pragma omp parallel for simd // #endif dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); #pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; // w[jx50][jy50] = wk_local[j80]; wk_index = nlmmt*jxy50; wk_value = tt1->wk[wk_index + j80]; // wk_index += nlmmt; int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; // w[jx50][jy50] = wk_value; } // jxy50 loop // } // jx50 // } // jy50 loop int ixyz = 0; #pragma omp parallel for simd for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for // #else // #pragma omp parallel for // #endif #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; // for (int iz75 = 0; iz75 < nzv; iz75++) { double z = _zv[iz75] + frsh; // for (int iy70 = 0; iy70 < nyv; iy70++) { double y = _yv[iy70]; // for (int ix65 = 0; ix65 < nxv; ix65++) { double x = _xv[ix65]; // ixyz++; dcomplex sumy = cc0; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sumy) // #else // #pragma omp parallel for simd reduction(+:sumy) // #endif #pragma omp parallel for simd reduction(+:sumy) for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double vkx = (jx55 == 0) ? vkv[nkv - 1] : vkv[jx55]; double vkz = vec_vkzm[(jx55 * nkv) + jy60]; dcomplex phas = (jx55 == 0) ? cexp(uim * (-vkx * x + vky * y + vkz * z)): cexp(uim * (vkx * x + vky * y + vkz * z)); dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; dcomplex sumx = vec_w[w_index] * phas; double factor1 = ((jx55 == 0) || (jx55 == (nkv - 1))) ? 0.5 : 1.0; double factor2 = ((jy60 == 0) || (jy60 == (nkv - 1))) ? 0.5 : 1.0; sumx *= factor1*factor2; sumy += sumx; } // jy60x55 loop vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; // } // ix65 loop // } // iy70 loop // } // iz75 loop vec_wsum[(j80 * nrvc) + ixyz] = sumy * delks; } // ixyz loop free(vec_w); free(w); // delete[] wk_local; } // j80 loop delete[] global_vec_w; #ifdef USE_NVTX nvtxRangePop(); #endif Loading