Loading src/trapping/cfrfme.cpp +25 −26 Original line number Diff line number Diff line Loading @@ -467,6 +467,8 @@ void frfme(string data_file, string output_path) { dcomplex *vec_wsum = tfrfme->wsum[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n"; logger.log(message); #ifdef USE_TARGET_OFFLOAD t_end = chrono::high_resolution_clock::now(); elapsed = t_start - t_end; Loading Loading @@ -664,10 +666,16 @@ void offload_loop( } // jxy50 loop } // Kernel 2: run the calculation #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { // Kernel 2: ensure that vec_wsum is initialized at 0 #pragma omp target teams distribute parallel for for (long i = 0; i < size_vec_wsum; i++) vec_wsum[i] = cc0; // Kernel 3: run the calculation #pragma omp target teams distribute parallel for collapse(3) for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; int iz75 = ixyz / nvxy; Loading @@ -676,15 +684,12 @@ void offload_loop( double z = _zv[iz75] + frsh; double y = _yv[iy70]; double x = _xv[ix65]; double rsumy = 0.0; double isumy = 0.0; #pragma omp parallel for reduction(+:rsumy, isumy) for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double factor = (jy60 == 0 || jy60 == nkvmo) ? 0.5 : 1.0; long wsum_index = (j80_index * nvtot) + ixyz; if (jx55 == 0) { // jx55 = 0: phasf double vkx = vkv[nkvmo]; Loading @@ -695,8 +700,7 @@ void offload_loop( dcomplex phasf = c + uim * s; dcomplex term = vec_w[jy60] * phasf * 0.5; term *= factor; rsumy += (real(term)); isumy += (imag(term)); vec_wsum[wsum_index] += term * delks; } else if (jx55 == nkvmo) { // jx55 = nkv - 1: phasl double vkx = vkv[nkvmo]; Loading @@ -707,8 +711,7 @@ void offload_loop( dcomplex phasl = c + uim * s; dcomplex term = vec_w[nkvvmo + jy60] * phasl * 0.5; term *= factor; rsumy += (real(term)); isumy += (imag(term)); vec_wsum[wsum_index] += term * delks; } else { // 1 <= jx55 < nkv - 1 double vkx = vkv[jx55]; Loading @@ -719,15 +722,11 @@ void offload_loop( dcomplex phas = c + uim * s; dcomplex term = vec_w[w_index] * phas; term *= factor; rsumy += (real(term)); isumy += (imag(term)); vec_wsum[wsum_index] += term * delks; } } // jy60x55 loop dcomplex sumy = rsumy + uim * isumy; vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks; } // ixyz loop } // j80 loop } // target region } #endif // USE TARGET_OFFLOAD Loading
src/trapping/cfrfme.cpp +25 −26 Original line number Diff line number Diff line Loading @@ -467,6 +467,8 @@ void frfme(string data_file, string output_path) { dcomplex *vec_wsum = tfrfme->wsum[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n"; logger.log(message); #ifdef USE_TARGET_OFFLOAD t_end = chrono::high_resolution_clock::now(); elapsed = t_start - t_end; Loading Loading @@ -664,10 +666,16 @@ void offload_loop( } // jxy50 loop } // Kernel 2: run the calculation #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { // Kernel 2: ensure that vec_wsum is initialized at 0 #pragma omp target teams distribute parallel for for (long i = 0; i < size_vec_wsum; i++) vec_wsum[i] = cc0; // Kernel 3: run the calculation #pragma omp target teams distribute parallel for collapse(3) for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; int iz75 = ixyz / nvxy; Loading @@ -676,15 +684,12 @@ void offload_loop( double z = _zv[iz75] + frsh; double y = _yv[iy70]; double x = _xv[ix65]; double rsumy = 0.0; double isumy = 0.0; #pragma omp parallel for reduction(+:rsumy, isumy) for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; int w_index = (jx55 * nkv) + jy60; double vky = vkv[jy60]; double factor = (jy60 == 0 || jy60 == nkvmo) ? 0.5 : 1.0; long wsum_index = (j80_index * nvtot) + ixyz; if (jx55 == 0) { // jx55 = 0: phasf double vkx = vkv[nkvmo]; Loading @@ -695,8 +700,7 @@ void offload_loop( dcomplex phasf = c + uim * s; dcomplex term = vec_w[jy60] * phasf * 0.5; term *= factor; rsumy += (real(term)); isumy += (imag(term)); vec_wsum[wsum_index] += term * delks; } else if (jx55 == nkvmo) { // jx55 = nkv - 1: phasl double vkx = vkv[nkvmo]; Loading @@ -707,8 +711,7 @@ void offload_loop( dcomplex phasl = c + uim * s; dcomplex term = vec_w[nkvvmo + jy60] * phasl * 0.5; term *= factor; rsumy += (real(term)); isumy += (imag(term)); vec_wsum[wsum_index] += term * delks; } else { // 1 <= jx55 < nkv - 1 double vkx = vkv[jx55]; Loading @@ -719,15 +722,11 @@ void offload_loop( dcomplex phas = c + uim * s; dcomplex term = vec_w[w_index] * phas; term *= factor; rsumy += (real(term)); isumy += (imag(term)); vec_wsum[wsum_index] += term * delks; } } // jy60x55 loop dcomplex sumy = rsumy + uim * isumy; vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks; } // ixyz loop } // j80 loop } // target region } #endif // USE TARGET_OFFLOAD