Loading src/trapping/cfrfme.cpp +20 −26 Original line number Original line Diff line number Diff line Loading @@ -466,7 +466,7 @@ void frfme(string data_file, string output_path) { const dcomplex *vec_tt1_wk = tt1->wk; const dcomplex *vec_tt1_wk = tt1->wk; dcomplex *vec_wsum = tfrfme->wsum[0]; dcomplex *vec_wsum = tfrfme->wsum[0]; double *vec_vkzm = vkzm[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w](); message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n"; message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n"; logger.log(message); logger.log(message); #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD Loading Loading @@ -617,12 +617,12 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif #endif #ifndef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD elapsed = chrono::high_resolution_clock::now() - t_start; frfme_duration = elapsed; #else elapsed = chrono::high_resolution_clock::now() - t_end; elapsed = chrono::high_resolution_clock::now() - t_end; frfme_duration += elapsed; frfme_duration += elapsed; #else elapsed = chrono::high_resolution_clock::now() - t_start; frfme_duration = elapsed; #endif #endif message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n"; message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n"; logger.log(message); logger.log(message); Loading @@ -646,14 +646,7 @@ void offload_loop( dcomplex cc0 = 0.0 + I * 0.0; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; dcomplex uim = 0.0 + I * 1.0; #pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \ #pragma omp parallel for simd collapse(2) map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) { // Kernel 1: prepare the data on the device #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int j80_index = j80 - jlmf + 1; int j80_index = j80 - jlmf + 1; Loading @@ -666,12 +659,13 @@ void offload_loop( } // jxy50 loop } // jxy50 loop } } // Kernel 2: ensure that vec_wsum is initialized at 0 #pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \ #pragma omp target teams distribute parallel for map(to: global_vec_w[0:size_global_vec_w]) \ for (long i = 0; i < size_vec_wsum; i++) map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ vec_wsum[i] = cc0; map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) // Kernel 3: run the calculation { // Kernel 1: run the calculation #pragma omp target teams distribute parallel for collapse(3) #pragma omp target teams distribute parallel for collapse(3) for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { Loading Loading
src/trapping/cfrfme.cpp +20 −26 Original line number Original line Diff line number Diff line Loading @@ -466,7 +466,7 @@ void frfme(string data_file, string output_path) { const dcomplex *vec_tt1_wk = tt1->wk; const dcomplex *vec_tt1_wk = tt1->wk; dcomplex *vec_wsum = tfrfme->wsum[0]; dcomplex *vec_wsum = tfrfme->wsum[0]; double *vec_vkzm = vkzm[0]; double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w](); message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n"; message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n"; logger.log(message); logger.log(message); #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD Loading Loading @@ -617,12 +617,12 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif #endif #ifndef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD elapsed = chrono::high_resolution_clock::now() - t_start; frfme_duration = elapsed; #else elapsed = chrono::high_resolution_clock::now() - t_end; elapsed = chrono::high_resolution_clock::now() - t_end; frfme_duration += elapsed; frfme_duration += elapsed; #else elapsed = chrono::high_resolution_clock::now() - t_start; frfme_duration = elapsed; #endif #endif message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n"; message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n"; logger.log(message); logger.log(message); Loading @@ -646,14 +646,7 @@ void offload_loop( dcomplex cc0 = 0.0 + I * 0.0; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; dcomplex uim = 0.0 + I * 1.0; #pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \ #pragma omp parallel for simd collapse(2) map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) { // Kernel 1: prepare the data on the device #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int j80_index = j80 - jlmf + 1; int j80_index = j80 - jlmf + 1; Loading @@ -666,12 +659,13 @@ void offload_loop( } // jxy50 loop } // jxy50 loop } } // Kernel 2: ensure that vec_wsum is initialized at 0 #pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \ #pragma omp target teams distribute parallel for map(to: global_vec_w[0:size_global_vec_w]) \ for (long i = 0; i < size_vec_wsum; i++) map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ vec_wsum[i] = cc0; map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) // Kernel 3: run the calculation { // Kernel 1: run the calculation #pragma omp target teams distribute parallel for collapse(3) #pragma omp target teams distribute parallel for collapse(3) for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int ixyz = 0; ixyz < nvtot; ixyz++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { Loading