Loading src/trapping/cfrfme.cpp +80 −44 Original line number Original line Diff line number Diff line Loading @@ -61,6 +61,14 @@ #include <nvtx3/nvToolsExt.h> #include <nvtx3/nvToolsExt.h> #endif #endif #ifdef _OPENMP #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; using namespace std; /*! \brief C++ implementation of FRFME /*! \brief C++ implementation of FRFME Loading Loading @@ -391,57 +399,84 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("j80 loop"); nvtxRangePush("j80 loop"); #endif #endif #pragma omp parallel for dcomplex *vec_wsum = tfrfme->wsum[0]; for (int j80 = jlmf; j80 <= jlml; j80++) { double *vec_vkzm = vkzm[0]; dcomplex *vec_w = new dcomplex[nkv * nkv](); #ifdef USE_TARGET_OFFLOAD dcomplex **w = new dcomplex*[nkv]; #pragma omp target teams distribute parallel for simd #endif // #pragma omp parallel for for (int j80 = jlmf-1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); // dcomplex *wk_local = new dcomplex[nlmmt](); // dcomplex *wk_local = new dcomplex[nlmmt](); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; dcomplex wk_value; int wk_index = 0; int wk_index = 0; int nkvs = nkv * nkv; // for (int jy50 = 0; jy50 < nkv; jy50++) { for (int jy50 = 0; jy50 < nkv; jy50++) { // for (int jx50 = 0; jx50 < nkv; jx50++) { for (int jx50 = 0; jx50 < nkv; jx50++) { // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #endif // #pragma omp parallel for for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; // w[jx50][jy50] = wk_local[j80 - 1]; // w[jx50][jy50] = wk_local[j80]; wk_value = tt1->wk[wk_index + j80 - 1]; wk_index = nlmmt*jxy50; wk_index += nlmmt; wk_value = tt1->wk[wk_index + j80]; w[jx50][jy50] = wk_value; // wk_index += nlmmt; } // jx50 int jy50 = jxy50 / nkv; } // jy50 loop int jx50 = jxy50 % nkv; vec_w[(nkv*jx50) + jy50] = wk_value; // w[jx50][jy50] = wk_value; } // jxy50 loop // } // jx50 // } // jy50 loop int ixyz = 0; int ixyz = 0; for (int wj = 0; wj < nrvc; wj++) tfrfme->wsum[j80 - 1][wj] = cc0; for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0; for (int iz75 = 0; iz75 < nzv; iz75++) { int nvtot = nxv*nyv*nzv; int nvxy = nxv *nyv; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for // #endif // #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; // for (int iz75 = 0; iz75 < nzv; iz75++) { double z = _zv[iz75] + frsh; double z = _zv[iz75] + frsh; for (int iy70 = 0; iy70 < nyv; iy70++) { // for (int iy70 = 0; iy70 < nyv; iy70++) { double y = _yv[iy70]; double y = _yv[iy70]; for (int ix65 = 0; ix65 < nxv; ix65++) { // for (int ix65 = 0; ix65 < nxv; ix65++) { double x = _xv[ix65]; double x = _xv[ix65]; ixyz++; // ixyz++; dcomplex sumy = cc0; dcomplex sumy = cc0; for (int jy60 = 0; jy60 < nkv; jy60++) { // #ifdef USE_TARGET_OFFLOAD // #pragma omp target parallel for simd reduction(+:sumy) // #endif for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; double vky = vkv[jy60]; double vky = vkv[jy60]; double vkx = vkv[nkv - 1]; double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55]; double vkzf = vkzm[0][jy60]; double vkz = vec_vkzm[(jx55*nkv)+jy60]; dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z)); dcomplex phas = (jx55==0) ? double vkzl = vkzm[nkv - 1][jy60]; cexp(uim * (-vkx * x + vky * y + vkz * z)): dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z)); cexp(uim * (vkx * x + vky * y + vkz * z)); dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl); dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; for (int jx55 = 2; jx55 <= nks; jx55++) { double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0; vkx = vkv[jx55 - 1]; double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0; double vkz = vkzm[jx55 - 1][jy60]; sumx *= factor1*factor2; dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z)); sumx += (w[jx55 - 1][jy60] * phas); } // jx55 loop if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5; sumy += sumx; sumy += sumx; } // jy60 loop } // jy60x55 loop tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks; vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; } // ix65 loop // } // ix65 loop } // iy70 loop // } // iy70 loop } // iz75 loop // } // iz75 loop delete[] vec_w; } // ixyz loop delete[] w; free(vec_w); free(w); // delete[] wk_local; // delete[] wk_local; } // j80 loop } // j80 loop #ifdef USE_NVTX #ifdef USE_NVTX Loading Loading @@ -499,3 +534,4 @@ void frfme(string data_file, string output_path) { nvtxRangePop(); nvtxRangePop(); #endif #endif } } Loading
src/trapping/cfrfme.cpp +80 −44 Original line number Original line Diff line number Diff line Loading @@ -61,6 +61,14 @@ #include <nvtx3/nvToolsExt.h> #include <nvtx3/nvToolsExt.h> #endif #endif #ifdef _OPENMP #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; using namespace std; /*! \brief C++ implementation of FRFME /*! \brief C++ implementation of FRFME Loading Loading @@ -391,57 +399,84 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("j80 loop"); nvtxRangePush("j80 loop"); #endif #endif #pragma omp parallel for dcomplex *vec_wsum = tfrfme->wsum[0]; for (int j80 = jlmf; j80 <= jlml; j80++) { double *vec_vkzm = vkzm[0]; dcomplex *vec_w = new dcomplex[nkv * nkv](); #ifdef USE_TARGET_OFFLOAD dcomplex **w = new dcomplex*[nkv]; #pragma omp target teams distribute parallel for simd #endif // #pragma omp parallel for for (int j80 = jlmf-1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); // dcomplex *wk_local = new dcomplex[nlmmt](); // dcomplex *wk_local = new dcomplex[nlmmt](); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; dcomplex wk_value; int wk_index = 0; int wk_index = 0; int nkvs = nkv * nkv; // for (int jy50 = 0; jy50 < nkv; jy50++) { for (int jy50 = 0; jy50 < nkv; jy50++) { // for (int jx50 = 0; jx50 < nkv; jx50++) { for (int jx50 = 0; jx50 < nkv; jx50++) { // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #endif // #pragma omp parallel for for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; // w[jx50][jy50] = wk_local[j80 - 1]; // w[jx50][jy50] = wk_local[j80]; wk_value = tt1->wk[wk_index + j80 - 1]; wk_index = nlmmt*jxy50; wk_index += nlmmt; wk_value = tt1->wk[wk_index + j80]; w[jx50][jy50] = wk_value; // wk_index += nlmmt; } // jx50 int jy50 = jxy50 / nkv; } // jy50 loop int jx50 = jxy50 % nkv; vec_w[(nkv*jx50) + jy50] = wk_value; // w[jx50][jy50] = wk_value; } // jxy50 loop // } // jx50 // } // jy50 loop int ixyz = 0; int ixyz = 0; for (int wj = 0; wj < nrvc; wj++) tfrfme->wsum[j80 - 1][wj] = cc0; for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0; for (int iz75 = 0; iz75 < nzv; iz75++) { int nvtot = nxv*nyv*nzv; int nvxy = nxv *nyv; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for // #endif // #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; // for (int iz75 = 0; iz75 < nzv; iz75++) { double z = _zv[iz75] + frsh; double z = _zv[iz75] + frsh; for (int iy70 = 0; iy70 < nyv; iy70++) { // for (int iy70 = 0; iy70 < nyv; iy70++) { double y = _yv[iy70]; double y = _yv[iy70]; for (int ix65 = 0; ix65 < nxv; ix65++) { // for (int ix65 = 0; ix65 < nxv; ix65++) { double x = _xv[ix65]; double x = _xv[ix65]; ixyz++; // ixyz++; dcomplex sumy = cc0; dcomplex sumy = cc0; for (int jy60 = 0; jy60 < nkv; jy60++) { // #ifdef USE_TARGET_OFFLOAD // #pragma omp target parallel for simd reduction(+:sumy) // #endif for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; double vky = vkv[jy60]; double vky = vkv[jy60]; double vkx = vkv[nkv - 1]; double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55]; double vkzf = vkzm[0][jy60]; double vkz = vec_vkzm[(jx55*nkv)+jy60]; dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z)); dcomplex phas = (jx55==0) ? double vkzl = vkzm[nkv - 1][jy60]; cexp(uim * (-vkx * x + vky * y + vkz * z)): dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z)); cexp(uim * (vkx * x + vky * y + vkz * z)); dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl); dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; for (int jx55 = 2; jx55 <= nks; jx55++) { double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0; vkx = vkv[jx55 - 1]; double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0; double vkz = vkzm[jx55 - 1][jy60]; sumx *= factor1*factor2; dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z)); sumx += (w[jx55 - 1][jy60] * phas); } // jx55 loop if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5; sumy += sumx; sumy += sumx; } // jy60 loop } // jy60x55 loop tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks; vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; } // ix65 loop // } // ix65 loop } // iy70 loop // } // iy70 loop } // iz75 loop // } // iz75 loop delete[] vec_w; } // ixyz loop delete[] w; free(vec_w); free(w); // delete[] wk_local; // delete[] wk_local; } // j80 loop } // j80 loop #ifdef USE_NVTX #ifdef USE_NVTX Loading Loading @@ -499,3 +534,4 @@ void frfme(string data_file, string output_path) { nvtxRangePop(); nvtxRangePop(); #endif #endif } }