Commit 5795b9a1 authored by Giovanni La Mura
Browse files

Flatten offloaded loop and add stage-dependent benchmark timers

parent 16e0d698
Loading
Loading
Loading
Loading
+32 −12
Original line number Diff line number Diff line
@@ -192,7 +192,7 @@ void frfme(string data_file, string output_path) {
#endif
  chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
  chrono::time_point<chrono::high_resolution_clock> t_end;
  chrono::duration<double> elapsed;
  chrono::duration<double> elapsed, frfme_duration;
  char buffer[256];
  string message = "INIT";
  Logger logger(LOG_INFO);
@@ -539,42 +539,52 @@ void frfme(string data_file, string output_path) {
	  double *vec_vkzm = vkzm[0];
	  dcomplex *global_vec_w = new dcomplex[size_global_vec_w];
#ifdef USE_TARGET_OFFLOAD
	  frfme_duration = t_start - chrono::high_resolution_clock::now();
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: Mapping data to device.\n";
	  logger.log(message);
	  nvtxRangePush("Mapping to device");
	  map_data(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt
          );
	  nvtxRangePop();
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  sprintf(buffer, "INFO: copying data to device took %lfs.\n", elapsed.count());
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: preparing data on device took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: computing loop.\n";
	  logger.log(message);
	  nvtxRangePush("Offloaded loop");
	  offload_loop(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt, delks, frsh
          );
	  nvtxRangePop();
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: loop calculation took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
	  t_start = chrono::high_resolution_clock::now();
	  message = "INFO: cleaning device memory.\n";
	  logger.log(message);
	  nvtxRangePush("Cleaning device");
	  unmap_data(
            vec_wsum, size_vec_wsum, global_vec_w, size_global_vec_w, vec_tt1_wk,
	    size_vec_tt1_wk, vkv, _xv, nxv, _yv, nyv, _zv, nzv, vec_vkzm, jlmf, jlml,
	    nkv, nlmmt
          );
	  nvtxRangePop();
	  t_end = chrono::high_resolution_clock::now();
	  elapsed = t_end - t_start;
	  frfme_duration += elapsed;
	  sprintf(buffer, "INFO: result recovery and device memory clean-up took %lfs.\n", elapsed.count());
	  message = string(buffer);
	  logger.log(message);
@@ -697,7 +707,12 @@ void frfme(string data_file, string output_path) {
  nvtxRangePop();
#endif
  elapsed = chrono::high_resolution_clock::now() - t_start;
  message = "INFO: FRFME took " + to_string(elapsed.count()) + "s.\n";
#ifdef USE_TARGET_OFFLOAD
  frfme_duration += elapsed;
#else
  frfme_duration = elapsed;
#endif
  message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n";
  logger.log(message);
#ifdef USE_NVTX
  nvtxRangePop();
@@ -717,6 +732,19 @@ void map_data(
  map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \
  map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(to: vkv[0:nkv], vec_vkzm[0:nkvs])

#pragma omp target teams distribute parallel for collapse(2)
  for (int j80 = jlmf - 1; j80 < jlml; j80++) {
    int j80_index = j80 - jlmf + 1;
    dcomplex *vec_w = global_vec_w + nkvs * j80_index;
    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
      int wk_index = nlmmt * jxy50;
      dcomplex wk_value = vec_tt1_wk[wk_index + j80_index];
      int jy50 = jxy50 / nkv;
      int jx50 = jxy50 % nkv;
      vec_w[(nkv * jx50) + jy50] = wk_value;
    } // jxy50 loop
  }  
}

void offload_loop(
@@ -732,18 +760,10 @@ void offload_loop(
  int nvxy = nxv * nyv;
  dcomplex cc0 = 0.0 + I * 0.0;
  dcomplex uim = 0.0 + I * 1.0;
#pragma omp target
#pragma omp target teams distribute parallel for collapse(2)
  for (int j80 = jlmf - 1; j80 < jlml; j80++) {
    int j80_index = j80 - jlmf + 1;
    dcomplex *vec_w = global_vec_w + nkvs * j80_index;
    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
      int wk_index = nlmmt * jxy50;
      dcomplex wk_value = vec_tt1_wk[wk_index + j80_index];
      int jy50 = jxy50 / nkv;
      int jx50 = jxy50 % nkv;
      vec_w[(nkv * jx50) + jy50] = wk_value;
    } // jxy50 loop
#pragma omp teams distribute parallel for
    for (int ixyz = 0; ixyz < nrvc; ixyz++) {
      int iz75 = ixyz / nvxy;
      int iy70 = (ixyz % nvxy) / nxv;