Loading src/trapping/cfrfme.cpp +23 −21 Original line number Diff line number Diff line Loading @@ -398,7 +398,8 @@ void frfme(string data_file, string output_path) { dcomplex *vec_wsum = tfrfme->wsum[0]; int size_wsum = nlmmt * nrvc; double *vec_vkzm = vkzm[0]; int size_vkzm = nkv * nkv; int nkvs = nkv * nkv; int size_vkzm = nkvs; const dcomplex *vec_tt1_wk = tt1->wk; int size_tt1_wk = nkv * nkv * nlmmt; int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1); Loading @@ -412,34 +413,34 @@ void frfme(string data_file, string output_path) { device_id = omp_get_default_device(); global_vec_w = (dcomplex *)omp_target_alloc(size_global_vec_w * sizeof(dcomplex), device_id); global_w = (dcomplex **)omp_target_alloc(size_global_w * sizeof(dcomplex), device_id); #pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \ map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: global_vec_w, global_w) \ firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks) #pragma omp target teams distribute parallel for \ map(tofrom: vec_wsum[0:size_wsum]) \ map(to: vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv]) \ map(to: _yv[0:nyv], _zv[0:nzv], global_vec_w, global_w) #else // Fall-back host work-space allocation global_vec_w = = new dcomplex[size_global_vec_w](); global_w = new dcomplex*[size_global_w]; #pragma omp parallel for simd #endif #pragma omp parallel for #endif // USE_TARGET_OFFLOAD for (int j80 = jlmf - 1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); dcomplex **w = global_w + nkv * (j80 - jlmf + 1); #pragma omp parallel for simd for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; int wk_index = 0; #pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { wk_index = nlmmt * jxy50; wk_value = vec_tt1_wk[wk_index + j80]; int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop int ixyz = 0; #pragma omp parallel for simd for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; Loading @@ -448,6 +449,7 @@ void frfme(string data_file, string output_path) { double y = _yv[iy70]; double x = _xv[ix65]; dcomplex sumy = cc0; #pragma omp parallel for simd for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; Loading Loading
src/trapping/cfrfme.cpp +23 −21 Original line number Diff line number Diff line Loading @@ -398,7 +398,8 @@ void frfme(string data_file, string output_path) { dcomplex *vec_wsum = tfrfme->wsum[0]; int size_wsum = nlmmt * nrvc; double *vec_vkzm = vkzm[0]; int size_vkzm = nkv * nkv; int nkvs = nkv * nkv; int size_vkzm = nkvs; const dcomplex *vec_tt1_wk = tt1->wk; int size_tt1_wk = nkv * nkv * nlmmt; int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1); Loading @@ -412,34 +413,34 @@ void frfme(string data_file, string output_path) { device_id = omp_get_default_device(); global_vec_w = (dcomplex *)omp_target_alloc(size_global_vec_w * sizeof(dcomplex), device_id); global_w = (dcomplex **)omp_target_alloc(size_global_w * sizeof(dcomplex), device_id); #pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \ map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: global_vec_w, global_w) \ firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks) #pragma omp target teams distribute parallel for \ map(tofrom: vec_wsum[0:size_wsum]) \ map(to: vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv]) \ map(to: _yv[0:nyv], _zv[0:nzv], global_vec_w, global_w) #else // Fall-back host work-space allocation global_vec_w = = new dcomplex[size_global_vec_w](); global_w = new dcomplex*[size_global_w]; #pragma omp parallel for simd #endif #pragma omp parallel for #endif // USE_TARGET_OFFLOAD for (int j80 = jlmf - 1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); dcomplex **w = global_w + nkv * (j80 - jlmf + 1); #pragma omp parallel for simd for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; int wk_index = 0; #pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { wk_index = nlmmt * jxy50; wk_value = vec_tt1_wk[wk_index + j80]; int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop int ixyz = 0; #pragma omp parallel for simd for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; #pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; Loading @@ -448,6 +449,7 @@ void frfme(string data_file, string output_path) { double y = _yv[iy70]; double x = _xv[ix65]; dcomplex sumy = cc0; #pragma omp parallel for simd for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; Loading