Commit 43d18f76 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Use a target data region with multiple kernels instead of separate functions with a target context

parent e940395a
Loading
Loading
Loading
Loading
+86 −213
Original line number Diff line number Diff line
@@ -66,42 +66,6 @@
#endif

#ifdef USE_TARGET_OFFLOAD
/*! \brief Specialized function to map data on the device.
 *
 * This function prepares a target device to perform the trapping calculation
 * loop by allocating the device memory and copying the input data. The
 * operation has been separated from the main loop execution for performance
 * evaluation purposes and in order to ensure that data communication occurs
 * only when strictly necessary.
 *
 * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed.
 * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed.
 * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to
 * all threads.
 * \param size_global_vec_w: `int` Size of the global weight vector space.
 * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights.
 * \param size_vec_tt1_wk: `int` Size of the swap object weights vector.
 * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT).
 * \param _xv: `double *` Vector of x-coordinate calculation spacing.
 * \param nxv: `int` Number of x-coordinates used in the calculation.
 * \param _yv: `double *` Vector of y-coordinate calculation spacing.
 * \param nyv: `int` Number of y-coordinates used in the calculation.
 * \param _zv: `double *` Vector of z-coordinate calculation spacing.
 * \param nzv: `int` Number of z-coordinates used in the calculation.
 * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT)
 * \param jlmf: `int` First order of the calculation.
 * \param jlml: `int` Last order of the calculation.
 * \param nkv: `int` Number of beam vector wave-numbers.
 * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field
 * expansion order.
 */
void map_data(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
);

/*! \brief Specialized function to perform GPU-offloaded trapping loop.
 *
 * The offload of GPU operations through interface layers, such as OpenMP,
@@ -142,41 +106,6 @@ void offload_loop(
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt, double delks, double frsh
);

/*! \brief Specialized function to clean memory on the device.
 *
 * This function cleans the device memory after the calculation loop and
 * copies the result back to the host system. The operation has been separated
 * from the main loop execution for performance evaluation purposes and in
 * order to ensure that data communication occurs only when strictly necessary.
 *
 * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed.
 * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed.
 * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to
 * all threads.
 * \param size_global_vec_w: `int` Size of the global weight vector space.
 * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights.
 * \param size_vec_tt1_wk: `int` Size of the swap object weights vector.
 * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT).
 * \param _xv: `double *` Vector of x-coordinate calculation spacing.
 * \param nxv: `int` Number of x-coordinates used in the calculation.
 * \param _yv: `double *` Vector of y-coordinate calculation spacing.
 * \param nyv: `int` Number of y-coordinates used in the calculation.
 * \param _zv: `double *` Vector of z-coordinate calculation spacing.
 * \param nzv: `int` Number of z-coordinates used in the calculation.
 * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT)
 * \param jlmf: `int` First order of the calculation.
 * \param jlml: `int` Last order of the calculation.
 * \param nkv: `int` Number of beam vector wave-numbers.
 * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field
 * expansion order.
 */
void unmap_data(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
);
#endif

using namespace std;
@@ -539,28 +468,9 @@ void frfme(string data_file, string output_path) {
	  double *vec_vkzm = vkzm[0];
	  dcomplex *global_vec_w = new dcomplex[size_global_vec_w];
#ifdef USE_TARGET_OFFLOAD
	  elapsed = t_start - chrono::high_resolution_clock::now();
	  frfme_duration = elapsed;
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: Mapping data to device.\n";
	  logger.log(message);
#ifdef USE_NVTX
	  nvtxRangePush("Mapping to device");
#endif
	  map_data(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt
          );
#ifdef USE_NVTX
	  nvtxRangePop();
#endif
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
	  elapsed = t_start - t_end;
	  frfme_duration = elapsed;
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: computing loop.\n";
	  logger.log(message);
@@ -581,26 +491,6 @@ void frfme(string data_file, string output_path) {
	  sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: cleaning device memory.\n";
	  logger.log(message);
#ifdef USE_NVTX
	  nvtxRangePush("Cleaning device");
#endif
	  unmap_data(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt
          );
#ifdef USE_NVTX
	  nvtxRangePop();
#endif
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
#else
#pragma omp parallel for
	  for (int j80 = jlmf - 1; j80 < jlml; j80++) {
@@ -740,19 +630,27 @@ void frfme(string data_file, string output_path) {
}

#ifdef USE_TARGET_OFFLOAD
void map_data(
void offload_loop(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
  const int nkv, const int nlmmt, double delks, double frsh
) {
  const int nkvs = nkv * nkv;
#pragma omp target enter data map(to: vec_wsum[0:size_vec_wsum]) \
  int nvtot = nxv * nyv * nzv;
  int nkvs = nkv * nkv;
  int nkvmo = nkvmo; // NOTE(review): self-initialization — nkvmo reads its own indeterminate value (UB); the name suggests "nkv minus one", i.e. likely intended `int nkvmo = nkv - 1;` — confirm against the pre-refactor code
  int nkvvmo = nkvmo * nkv;
  int nvxy = nxv * nyv;
  dcomplex cc0 = 0.0 + I * 0.0;
  dcomplex uim = 0.0 + I * 1.0;

#pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \
  map(alloc: global_vec_w[0:size_global_vec_w]) \
  map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \
  map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(to: vkv[0:nkv], vec_vkzm[0:nkvs])

  {
    // Kernel 1: prepare the data on the device
#pragma omp target teams distribute parallel for collapse(2)
    for (int j80 = jlmf - 1; j80 < jlml; j80++) {
      for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
@@ -765,21 +663,8 @@ void map_data(
	vec_w[(nkv * jx50) + jy50] = wk_value;
      } // jxy50 loop
    }  
}

void offload_loop(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt, double delks, double frsh
) {
  int nvtot = nxv * nyv * nzv;
  int nkvs = nkv * nkv;
  int nkvmo = nkvmo;
  int nkvvmo = nkvmo * nkv;
  int nvxy = nxv * nyv;
  dcomplex cc0 = 0.0 + I * 0.0;
  dcomplex uim = 0.0 + I * 1.0;
    // Kernel 2: run the calculation
#pragma omp target teams distribute parallel for collapse(2)
    for (int j80 = jlmf - 1; j80 < jlml; j80++) {
      for (int ixyz = 0; ixyz < nvtot; ixyz++) {
@@ -842,19 +727,7 @@ void offload_loop(
	vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks;
      } // ixyz loop
    } // j80 loop
}
    
void unmap_data(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
) {
  const int nkvs = nkv * nkv;
#pragma omp target exit data map(from: vec_wsum[0:size_vec_wsum]) \
  map(delete: global_vec_w[0:size_global_vec_w]) \
  map(delete: vec_tt1_wk[0:size_vec_tt1_wk]) \
  map(delete: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(delete: vkv[0:nkv], vec_vkzm[0:nkvs])
  } // target region
}
#endif // USE_TARGET_OFFLOAD