Loading src/trapping/cfrfme.cpp +86 −213 Original line number Diff line number Diff line Loading @@ -66,42 +66,6 @@ #endif #ifdef USE_TARGET_OFFLOAD /*! \brief Specialized function to map data on the device. * * This function prepares a target device to perform the trapping calulation * loop by allocating the device memory and copying the input data. The * operation has been separated from the main loop execution for performance * evaluation purposes and in order to grant data data communication occurs * only when strictly necessary. * * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed. * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed. * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to * all threads. * \param int size_global_vec_w: `int` Size of the global weight vector space. * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights. * \param int size_vec_tt1_wk: `int` Size of the swap object weights vector. * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT). * \param _xv: `double *` Vector of x-coordinate calculation spacing. * \param nxv: `int` Number of x-coordinates used in the calculation. * \param _yv: `double *` Vector of y-coordinate calculation spacing. * \param nyv: `int` Number of y-coordinates used in the calculation. * \param _zv: `double *` Vector of z-coordinate calculation spacing. * \param nzv: `int` Number of z-coordinates used in the calculation. * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT) * \param jlmf: `int` First order of the calculation. * \param jlml: `int` Last order of the calculation. * \param nkv: `int` Number of beam vector wave-numbers. * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field * expansion order. */ void map_data( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt ); /*! \brief Specialized function to perform GPU-offloaded trapping loop. * * The offload of GPU operations through interface layers, such as OpenMP, Loading Loading @@ -142,41 +106,6 @@ void offload_loop( double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt, double delks, double frsh ); /*! \brief Specialized function to clean memory on the device. * * This function cleans the device memory after the calculation loop and * copies the result back to the host system. The operation has been separated * from the main loop execution for performance evaluation purposes and in * order to grant data data communication occurs only when strictly necessary. * * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed. * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed. * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to * all threads. * \param int size_global_vec_w: `int` Size of the global weight vector space. * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights. * \param int size_vec_tt1_wk: `int` Size of the swap object weights vector. * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT). * \param _xv: `double *` Vector of x-coordinate calculation spacing. * \param nxv: `int` Number of x-coordinates used in the calculation. * \param _yv: `double *` Vector of y-coordinate calculation spacing. * \param nyv: `int` Number of y-coordinates used in the calculation. * \param _zv: `double *` Vector of z-coordinate calculation spacing. * \param nzv: `int` Number of z-coordinates used in the calculation. * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT) * \param jlmf: `int` First order of the calculation. * \param jlml: `int` Last order of the calculation. * \param nkv: `int` Number of beam vector wave-numbers. * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field * expansion order. */ void unmap_data( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt ); #endif using namespace std; Loading Loading @@ -539,28 +468,9 @@ void frfme(string data_file, string output_path) { double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD elapsed = t_start - chrono::high_resolution_clock::now(); frfme_duration = elapsed; t_start = chrono::high_resolution_clock::now(); message = "INFO: Mapping data to device.\n"; logger.log(message); #ifdef USE_NVTX nvtxRangePush("Mapping to device"); #endif map_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); #ifdef USE_NVTX nvtxRangePop(); #endif t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); elapsed = t_start - t_end; frfme_duration = elapsed; t_start = chrono::high_resolution_clock::now(); message = "INFO: computing loop.\n"; logger.log(message); Loading @@ -581,26 +491,6 @@ void frfme(string data_file, string output_path) { sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); t_start = chrono::high_resolution_clock::now(); message = "INFO: cleaning device memory.\n"; logger.log(message); #ifdef USE_NVTX nvtxRangePush("Cleaning device"); #endif unmap_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); #ifdef USE_NVTX nvtxRangePop(); #endif t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); #else #pragma omp parallel for for (int j80 = jlmf - 1; j80 < jlml; j80++) { Loading Loading @@ -740,19 +630,27 @@ void frfme(string data_file, string output_path) { } #ifdef USE_TARGET_OFFLOAD void map_data( void offload_loop( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt const int nkv, const int nlmmt, double delks, double frsh ) { const int nkvs = nkv * nkv; #pragma omp target enter data map(to: vec_wsum[0:size_vec_wsum]) \ int nvtot = nxv * nyv * nzv; int nkvs = nkv * nkv; int nkvmo = nkvmo; int nkvvmo = nkvmo * nkv; int nvxy = nxv * nyv; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; #pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \ map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) { // Kernel 1: prepare the data on the device #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { Loading @@ -765,21 +663,8 @@ void map_data( vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop } } void offload_loop( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt, double delks, double frsh ) { int nvtot = nxv * nyv * nzv; int nkvs = nkv * nkv; int nkvmo = nkvmo; int nkvvmo = nkvmo * nkv; int nvxy = nxv * nyv; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; // Kernel 2: run the calculation #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int ixyz = 0; ixyz < nvtot; ixyz++) { Loading Loading @@ -842,19 +727,7 @@ void offload_loop( vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks; } // ixyz loop } // j80 loop } void unmap_data( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt ) { const int nkvs = nkv * nkv; #pragma omp target exit data map(from: vec_wsum[0:size_vec_wsum]) \ map(delete: global_vec_w[0:size_global_vec_w]) \ map(delete: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(delete: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(delete: vkv[0:nkv], vec_vkzm[0:nkvs]) } // target region } #endif // USE TARGET_OFFLOAD Loading
src/trapping/cfrfme.cpp +86 −213 Original line number Diff line number Diff line Loading @@ -66,42 +66,6 @@ #endif #ifdef USE_TARGET_OFFLOAD /*! \brief Specialized function to map data on the device. * * This function prepares a target device to perform the trapping calulation * loop by allocating the device memory and copying the input data. The * operation has been separated from the main loop execution for performance * evaluation purposes and in order to grant data data communication occurs * only when strictly necessary. * * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed. * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed. * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to * all threads. * \param int size_global_vec_w: `int` Size of the global weight vector space. * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights. * \param int size_vec_tt1_wk: `int` Size of the swap object weights vector. * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT). * \param _xv: `double *` Vector of x-coordinate calculation spacing. * \param nxv: `int` Number of x-coordinates used in the calculation. * \param _yv: `double *` Vector of y-coordinate calculation spacing. * \param nyv: `int` Number of y-coordinates used in the calculation. * \param _zv: `double *` Vector of z-coordinate calculation spacing. * \param nzv: `int` Number of z-coordinates used in the calculation. * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT) * \param jlmf: `int` First order of the calculation. * \param jlml: `int` Last order of the calculation. * \param nkv: `int` Number of beam vector wave-numbers. * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field * expansion order. */ void map_data( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt ); /*! \brief Specialized function to perform GPU-offloaded trapping loop. * * The offload of GPU operations through interface layers, such as OpenMP, Loading Loading @@ -142,41 +106,6 @@ void offload_loop( double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt, double delks, double frsh ); /*! \brief Specialized function to clean memory on the device. * * This function cleans the device memory after the calculation loop and * copies the result back to the host system. The operation has been separated * from the main loop execution for performance evaluation purposes and in * order to grant data data communication occurs only when strictly necessary. * * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed. * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed. * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to * all threads. * \param int size_global_vec_w: `int` Size of the global weight vector space. * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights. * \param int size_vec_tt1_wk: `int` Size of the swap object weights vector. * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT). * \param _xv: `double *` Vector of x-coordinate calculation spacing. * \param nxv: `int` Number of x-coordinates used in the calculation. * \param _yv: `double *` Vector of y-coordinate calculation spacing. * \param nyv: `int` Number of y-coordinates used in the calculation. * \param _zv: `double *` Vector of z-coordinate calculation spacing. * \param nzv: `int` Number of z-coordinates used in the calculation. * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT) * \param jlmf: `int` First order of the calculation. * \param jlml: `int` Last order of the calculation. * \param nkv: `int` Number of beam vector wave-numbers. * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field * expansion order. */ void unmap_data( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt ); #endif using namespace std; Loading Loading @@ -539,28 +468,9 @@ void frfme(string data_file, string output_path) { double *vec_vkzm = vkzm[0]; dcomplex *global_vec_w = new dcomplex[size_global_vec_w]; #ifdef USE_TARGET_OFFLOAD elapsed = t_start - chrono::high_resolution_clock::now(); frfme_duration = elapsed; t_start = chrono::high_resolution_clock::now(); message = "INFO: Mapping data to device.\n"; logger.log(message); #ifdef USE_NVTX nvtxRangePush("Mapping to device"); #endif map_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); #ifdef USE_NVTX nvtxRangePop(); #endif t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); elapsed = t_start - t_end; frfme_duration = elapsed; t_start = chrono::high_resolution_clock::now(); message = "INFO: computing loop.\n"; logger.log(message); Loading @@ -581,26 +491,6 @@ void frfme(string data_file, string output_path) { sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); t_start = chrono::high_resolution_clock::now(); message = "INFO: cleaning device memory.\n"; logger.log(message); #ifdef USE_NVTX nvtxRangePush("Cleaning device"); #endif unmap_data( vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk, size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml, nkv, nlmmt ); #ifdef USE_NVTX nvtxRangePop(); #endif t_end = chrono::high_resolution_clock::now(); elapsed = t_end - t_start; frfme_duration += elapsed; sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count()); message = string(buffer); logger.log(message); #else #pragma omp parallel for for (int j80 = jlmf - 1; j80 < jlml; j80++) { Loading Loading @@ -740,19 +630,27 @@ void frfme(string data_file, string output_path) { } #ifdef USE_TARGET_OFFLOAD void map_data( void offload_loop( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt const int nkv, const int nlmmt, double delks, double frsh ) { const int nkvs = nkv * nkv; #pragma omp target enter data map(to: vec_wsum[0:size_vec_wsum]) \ int nvtot = nxv * nyv * nzv; int nkvs = nkv * nkv; int nkvmo = nkvmo; int nkvvmo = nkvmo * nkv; int nvxy = nxv * nyv; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; #pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \ map(alloc: global_vec_w[0:size_global_vec_w]) \ map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to: vkv[0:nkv], vec_vkzm[0:nkvs]) { // Kernel 1: prepare the data on the device #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { Loading @@ -765,21 +663,8 @@ void map_data( vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop } } void offload_loop( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt, double delks, double frsh ) { int nvtot = nxv * nyv * nzv; int nkvs = nkv * nkv; int nkvmo = nkvmo; int nkvvmo = nkvmo * nkv; int nvxy = nxv * nyv; dcomplex cc0 = 0.0 + I * 0.0; dcomplex uim = 0.0 + I * 1.0; // Kernel 2: run the calculation #pragma omp target teams distribute parallel for collapse(2) for (int j80 = jlmf - 1; j80 < jlml; j80++) { for (int ixyz = 0; ixyz < nvtot; ixyz++) { Loading Loading @@ -842,19 +727,7 @@ void offload_loop( vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks; } // ixyz loop } // j80 loop } void unmap_data( dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w, const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv, double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml, const int nkv, const int nlmmt ) { const int nkvs = nkv * nkv; #pragma omp target exit data map(from: vec_wsum[0:size_vec_wsum]) \ map(delete: global_vec_w[0:size_global_vec_w]) \ map(delete: vec_tt1_wk[0:size_vec_tt1_wk]) \ map(delete: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(delete: vkv[0:nkv], vec_vkzm[0:nkvs]) } // target region } #endif // USE TARGET_OFFLOAD