Loading src/trapping/cfrfme.cpp +32 −12 Original line number Diff line number Diff line Loading @@ -192,7 +192,7 @@ void frfme(string data_file, string output_path) { #endif chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); chrono::time_point<chrono::high_resolution_clock> t_end; chrono::duration<double> elapsed; chrono::duration<double> elapsed, frfme_duration; char buffer[256]; string message = "INIT"; Logger logger(LOG_INFO); Loading Loading @@ -539,42 +539,52 @@ void frfme(string data_file, string output_path) { double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD frfme_duration = t_start - chrono::high_resolution_clock::now(); t_start = chrono::high_resolution_clock::now(); message = "INFO: Mapping data to device.\n"; logger.log(message); nvtxRangePush("Mapping to device"); map_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); nvtxRangePop(); t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; sprintf(buffer, "INFO: copying data to device took %lfs.\n", elapsed.count()); frfme_duration += elapsed; sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); t_start = chrono::high_resolution_clock::now(); message = "INFO: computing loop.\n"; logger.log(message); nvtxRangePush("Offloaded loop"); offload_loop( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt, delks, frsh ); nvtxRangePop(); t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); t_start = chrono::high_resolution_clock::now(); message = "INFO: cleaning device memory.\n"; logger.log(message); nvtxRangePush("Cleaning device"); unmap_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); nvtxRangePop(); t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); Loading Loading @@ -697,7 +707,12 @@ void frfme(string data_file, string output_path) { nvtxRangePop(); #endif elapsed = chrono::high_resolution_clock::now() - t_start; message = "INFO: FRFME took " + to_string(elapsed.count()) + "s.\n"; #ifdef USE_TARGET_OFFLOAD frfme_duration += elapsed; #else frfme_duration = elapsed; #endif message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n"; logger.log(message); #ifdef USE_NVTX nvtxRangePop(); Loading @@ -717,6 +732,19 @@ void map_data( map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80_index]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop } } void offload_loop( Loading @@ -732,18 +760,10 @@ void offload_loop( int nvxy = nxv * nyv; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; #pragma omp target #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80_index]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop #pragma omp teams distribute parallel for for (int ixyz = 0; ixyz < nrvc; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; Loading Loading
src/trapping/cfrfme.cpp +32 −12 Original line number Diff line number Diff line Loading @@ -192,7 +192,7 @@ void frfme(string data_file, string output_path) { #endif chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); chrono::time_point<chrono::high_resolution_clock> t_end; chrono::duration<double> elapsed; chrono::duration<double> elapsed, frfme_duration; char buffer[256]; string message = "INIT"; Logger logger(LOG_INFO); Loading Loading @@ -539,42 +539,52 @@ void frfme(string data_file, string output_path) { double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD frfme_duration = t_start - chrono::high_resolution_clock::now(); t_start = chrono::high_resolution_clock::now(); message = "INFO: Mapping data to device.\n"; logger.log(message); nvtxRangePush("Mapping to device"); map_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); nvtxRangePop(); t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; sprintf(buffer, "INFO: copying data to device took %lfs.\n", elapsed.count()); frfme_duration += elapsed; sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); t_start = chrono::high_resolution_clock::now(); message = "INFO: computing loop.\n"; logger.log(message); nvtxRangePush("Offloaded loop"); offload_loop( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt, delks, frsh ); nvtxRangePop(); t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); t_start = chrono::high_resolution_clock::now(); message = "INFO: cleaning device memory.\n"; logger.log(message); nvtxRangePush("Cleaning device"); unmap_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); nvtxRangePop(); t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); Loading Loading @@ -697,7 +707,12 @@ void frfme(string data_file, string output_path) { nvtxRangePop(); #endif elapsed = chrono::high_resolution_clock::now() - t_start; message = "INFO: FRFME took " + to_string(elapsed.count()) + "s.\n"; #ifdef USE_TARGET_OFFLOAD frfme_duration += elapsed; #else frfme_duration = elapsed; #endif message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n"; logger.log(message); #ifdef USE_NVTX nvtxRangePop(); Loading @@ -717,6 +732,19 @@ void map_data( map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80_index]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop } } void offload_loop( Loading @@ -732,18 +760,10 @@ void offload_loop( int nvxy = nxv * nyv; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; #pragma omp target #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { int j80_index = j80 - jlmf + 1; dcomplex *vec_w = global_vec_w + nkvs * j80_index; for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { int wk_index = nlmmt * jxy50; dcomplex wk_value = vec_tt1_wk[wk_index + j80_index]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop #pragma omp teams distribute parallel for for (int ixyz = 0; ixyz < nrvc; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; Loading