Commit 43d18f76 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Use a target data region with multiple kernels instead of separate functions with a target context

parent e940395a
Loading
Loading
Loading
Loading
+86 −213
Original line number Diff line number Diff line
@@ -66,42 +66,6 @@
#endif

#ifdef USE_TARGET_OFFLOAD
/*! \brief Specialized function to map data on the device.
 *
 * This function prepares a target device to perform the trapping calculation
 * loop by allocating the device memory and copying the input data. The
 * operation has been separated from the main loop execution for performance
 * evaluation purposes and in order to ensure that data communication occurs
 * only when strictly necessary.
 *
 * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed.
 * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed.
 * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to
 * all threads.
 * \param size_global_vec_w: `int` Size of the global weight vector space.
 * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights.
 * \param size_vec_tt1_wk: `int` Size of the swap object weights vector.
 * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT).
 * \param _xv: `double *` Vector of x-coordinate calculation spacing.
 * \param nxv: `int` Number of x-coordinates used in the calculation.
 * \param _yv: `double *` Vector of y-coordinate calculation spacing.
 * \param nyv: `int` Number of y-coordinates used in the calculation.
 * \param _zv: `double *` Vector of z-coordinate calculation spacing.
 * \param nzv: `int` Number of z-coordinates used in the calculation.
 * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT)
 * \param jlmf: `int` First order of the calculation.
 * \param jlml: `int` Last order of the calculation.
 * \param nkv: `int` Number of beam vector wave-numbers.
 * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field
 * expansion order.
 */
void map_data(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
);

/*! \brief Specialized function to perform GPU-offloaded trapping loop.
 *
 * The offload of GPU operations through interface layers, such as OpenMP,
@@ -142,41 +106,6 @@ void offload_loop(
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt, double delks, double frsh
);

/*! \brief Specialized function to clean memory on the device.
 *
 * This function cleans the device memory after the calculation loop and
 * copies the result back to the host system. The operation has been separated
 * from the main loop execution for performance evaluation purposes and in
 * order to ensure that data communication occurs only when strictly necessary.
 *
 * \param vec_wsum: `dcomplex *` Vector of weight sums to be computed.
 * \param size_vec_wsum: `int` Size of the vector of weight sums to be computed.
 * \param global_vec_w: `dcomplex *` Global space for weight vectors visible to
 * all threads.
 * \param size_global_vec_w: `int` Size of the global weight vector space.
 * \param vec_tt1_wk: `dcomplex *` Vector of swap object weights.
 * \param size_vec_tt1_wk: `int` Size of the swap object weights vector.
 * \param vkv: `double *` Vector of beam fields wave numbers (with size = NLMMT).
 * \param _xv: `double *` Vector of x-coordinate calculation spacing.
 * \param nxv: `int` Number of x-coordinates used in the calculation.
 * \param _yv: `double *` Vector of y-coordinate calculation spacing.
 * \param nyv: `int` Number of y-coordinates used in the calculation.
 * \param _zv: `double *` Vector of z-coordinate calculation spacing.
 * \param nzv: `int` Number of z-coordinates used in the calculation.
 * \param vec_vkzm: `double *` Vectorized VKZM matrix (size = NLMMT x NLMMT)
 * \param jlmf: `int` First order of the calculation.
 * \param jlml: `int` Last order of the calculation.
 * \param nkv: `int` Number of beam vector wave-numbers.
 * \param nlmmt: `int` NLMMT = 2 x LM x (LM + 2), with LM as maximum field
 * expansion order.
 */
void unmap_data(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
);
#endif

using namespace std;
@@ -539,28 +468,9 @@ void frfme(string data_file, string output_path) {
	  double *vec_vkzm = vkzm[0];
	  dcomplex *global_vec_w = new dcomplex[size_global_vec_w];
#ifdef USE_TARGET_OFFLOAD
	  elapsed = t_start - chrono::high_resolution_clock::now();
	  frfme_duration = elapsed;
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: Mapping data to device.\n";
	  logger.log(message);
#ifdef USE_NVTX
	  nvtxRangePush("Mapping to device");
#endif
	  map_data(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt
          );
#ifdef USE_NVTX
	  nvtxRangePop();
#endif
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
	  elapsed = t_start - t_end;
	  frfme_duration = elapsed;
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: computing loop.\n";
	  logger.log(message);
@@ -581,26 +491,6 @@ void frfme(string data_file, string output_path) {
	  sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: cleaning device memory.\n";
	  logger.log(message);
#ifdef USE_NVTX
	  nvtxRangePush("Cleaning device");
#endif
	  unmap_data(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt
          );
#ifdef USE_NVTX
	  nvtxRangePop();
#endif
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
#else
#pragma omp parallel for
	  for (int j80 = jlmf - 1; j80 < jlml; j80++) {
@@ -740,19 +630,27 @@ void frfme(string data_file, string output_path) {
}

#ifdef USE_TARGET_OFFLOAD
void map_data(
void offload_loop(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
  const int nkv, const int nlmmt, double delks, double frsh
) {
  const int nkvs = nkv * nkv;
#pragma omp target enter data map(to: vec_wsum[0:size_vec_wsum]) \
  int nvtot = nxv * nyv * nzv;
  int nkvs = nkv * nkv;
  int nkvmo = nkvmo; // NOTE(review): self-initialization — nkvmo reads its own indeterminate value (UB); the name suggests "nkv minus one", i.e. likely intended `int nkvmo = nkv - 1;` — confirm against the pre-refactor code
  int nkvvmo = nkvmo * nkv;
  int nvxy = nxv * nyv;
  dcomplex cc0 = 0.0 + I * 0.0;
  dcomplex uim = 0.0 + I * 1.0;

#pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \
  map(alloc: global_vec_w[0:size_global_vec_w]) \
  map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \
  map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(to: vkv[0:nkv], vec_vkzm[0:nkvs])

  {
    // Kernel 1: prepare the data on the device
#pragma omp target teams distribute parallel for collapse(2)
    for (int j80 = jlmf - 1; j80 < jlml; j80++) {
      for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
@@ -765,21 +663,8 @@ void map_data(
	vec_w[(nkv * jx50) + jy50] = wk_value;
      } // jxy50 loop
    }  
}

void offload_loop(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt, double delks, double frsh
) {
  int nvtot = nxv * nyv * nzv;
  int nkvs = nkv * nkv;
  int nkvmo = nkvmo;
  int nkvvmo = nkvmo * nkv;
  int nvxy = nxv * nyv;
  dcomplex cc0 = 0.0 + I * 0.0;
  dcomplex uim = 0.0 + I * 1.0;
    // Kernel 2: run the calculation
#pragma omp target teams distribute parallel for collapse(2)
    for (int j80 = jlmf - 1; j80 < jlml; j80++) {
      for (int ixyz = 0; ixyz < nvtot; ixyz++) {
@@ -842,19 +727,7 @@ void offload_loop(
	vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks;
      } // ixyz loop
    } // j80 loop
}
    
void unmap_data(
  dcomplex *vec_wsum, const int size_vec_wsum, dcomplex *global_vec_w, const int size_global_vec_w,
  const dcomplex *vec_tt1_wk, const int size_vec_tt1_wk, double *vkv, double *_xv, const int nxv,
  double *_yv, const int nyv, double *_zv, const int nzv, double *vec_vkzm, const int jlmf, const int jlml,
  const int nkv, const int nlmmt
) {
  const int nkvs = nkv * nkv;
#pragma omp target exit data map(from: vec_wsum[0:size_vec_wsum]) \
  map(delete: global_vec_w[0:size_global_vec_w]) \
  map(delete: vec_tt1_wk[0:size_vec_tt1_wk]) \
  map(delete: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(delete: vkv[0:nkv], vec_vkzm[0:nkvs])
  } // target region
}
#endif // USE_TARGET_OFFLOAD