Commit d081ed92 authored by Giovanni La Mura

Use CPU to initialize global work space and result vector, then run a single GPU kernel

parent a4af2237
+20 −26
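The commit message summarizes the restructuring: buffer initialization moves from two small device kernels to the host, leaving a single offloaded compute kernel. A minimal sketch of that pattern, with illustrative placeholder names (nw, ns, w, wsum are not the project's identifiers):

#include <vector>

int main() {
  const long nw = 1 << 16, ns = 1 << 8;
  // Host initialization replaces the former device-side "prepare" and
  // "zero the result" kernels.
  std::vector<double> w(nw, 0.0), wsum(ns, 0.0);
#pragma omp parallel for
  for (long i = 0; i < nw; i++) w[i] = 1.0 / (1.0 + (double) i);

  double *pw = w.data();
  double *pws = wsum.data();
  // Single GPU kernel: inputs copied in with map(to:), results with map(tofrom:).
#pragma omp target teams distribute parallel for \
    map(to: pw[0:nw]) map(tofrom: pws[0:ns])
  for (long j = 0; j < ns; j++) {
    double acc = 0.0;
    for (long i = 0; i < nw; i++) acc += pw[i];
    pws[j] = acc;
  }
  return 0;
}

Built with an offload-capable compiler (e.g. g++ -fopenmp), the target pragma falls back to host execution when no device is available, so the sketch runs either way.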
@@ -466,7 +466,7 @@ void frfme(string data_file, string output_path) {
 	  const dcomplex *vec_tt1_wk = tt1->wk;
 	  dcomplex *vec_wsum = tfrfme->wsum[0];
 	  double *vec_vkzm = vkzm[0];
-	  dcomplex *global_vec_w = new dcomplex[size_global_vec_w];
+	  dcomplex *global_vec_w = new dcomplex[size_global_vec_w]();
 	  message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n";
 	  logger.log(message);
 #ifdef USE_TARGET_OFFLOAD
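The only change in this hunk is the trailing (): new dcomplex[n] leaves the elements default-initialized (indeterminate for a trivial type), while new dcomplex[n]() value-initializes them to zero. Since global_vec_w is now filled on the host and copied to the device, it must start from a defined state. A minimal standalone illustration:

#include <cassert>

int main() {
  double *a = new double[4];   // default-initialized: element values indeterminate
  double *b = new double[4](); // value-initialized: every element is 0.0
  assert(b[0] == 0.0 && b[3] == 0.0);
  a[0] = 1.0;                  // a must be written before it is read
  delete[] a;
  delete[] b;
  return 0;
}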
@@ -617,12 +617,12 @@ void frfme(string data_file, string output_path) {
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
-#ifndef USE_TARGET_OFFLOAD
-  elapsed = chrono::high_resolution_clock::now() - t_start;
-  frfme_duration = elapsed;
-#else
+#ifdef USE_TARGET_OFFLOAD
   elapsed = chrono::high_resolution_clock::now() - t_end;
   frfme_duration += elapsed;
+#else
+  elapsed = chrono::high_resolution_clock::now() - t_start;
+  frfme_duration = elapsed;
 #endif
   message = "INFO: FRFME took " + to_string(frfme_duration.count()) + "s.\n";
   logger.log(message);
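The timing branch is inverted rather than removed: with offload enabled, frfme_duration accumulates the interval measured from t_end, while the non-offload path takes the whole span from t_start. A small sketch of the accumulating chrono pattern (the loop body and names are illustrative, not the project's):

#include <chrono>
#include <iostream>

int main() {
  std::chrono::duration<double> total(0.0);
  for (int pass = 0; pass < 3; pass++) {
    auto t0 = std::chrono::high_resolution_clock::now();
    volatile double x = 0.0;
    for (long i = 0; i < 1000000L; i++) x = x + 1.0;
    // Accumulate per-pass time, as frfme_duration += elapsed does above.
    total += std::chrono::high_resolution_clock::now() - t0;
  }
  std::cout << "INFO: total " << total.count() << "s.\n";
  return 0;
}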
@@ -646,14 +646,7 @@ void offload_loop(
   dcomplex cc0 = 0.0 + I * 0.0;
   dcomplex uim = 0.0 + I * 1.0;
 
-#pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \
-  map(alloc: global_vec_w[0:size_global_vec_w]) \
-  map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \
-  map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
-  map(to: vkv[0:nkv], vec_vkzm[0:nkvs])
-  {
-    // Kernel 1: prepare the data on the device
-#pragma omp target teams distribute parallel for collapse(2)
+#pragma omp parallel for simd collapse(2)
   for (int j80 = jlmf - 1; j80 < jlml; j80++) {
     for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
       int j80_index = j80 - jlmf + 1;
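Here the former "Kernel 1" device pragma becomes a plain host pragma: the same two collapsed loops now run across CPU threads and SIMD lanes. A self-contained sketch of omp parallel for simd collapse(2) over independent (j, k) pairs (names are placeholders):

#include <vector>

int main() {
  const int nj = 8, nk = 512;
  std::vector<double> w((size_t) nj * nk, 0.0);
  double *pw = w.data();
  // Both loop levels are collapsed into one parallel iteration space;
  // every (j, k) pair writes a distinct element, so there are no races.
#pragma omp parallel for simd collapse(2)
  for (int j = 0; j < nj; j++) {
    for (int k = 0; k < nk; k++) {
      pw[(long) j * nk + k] = 0.5 * j + k;
    }
  }
  return 0;
}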
@@ -666,12 +659,13 @@ void offload_loop(
     } // jxy50 loop
   }
 
-    // Kernel 2: ensure that vec_wsum is initialized at 0
-#pragma omp target teams distribute parallel for
-    for (long i = 0; i < size_vec_wsum; i++)
-      vec_wsum[i] = cc0;
-
-    // Kernel 3: run the calculation
+#pragma omp target data map(tofrom: vec_wsum[0:size_vec_wsum]) \
+  map(to: global_vec_w[0:size_global_vec_w]) \
+  map(to: vec_tt1_wk[0:size_vec_tt1_wk]) \
+  map(to: _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
+  map(to: vkv[0:nkv], vec_vkzm[0:nkvs])
+  {
+    // Kernel 1: run the calculation
 #pragma omp target teams distribute parallel for collapse(3)
     for (int ixyz = 0; ixyz < nvtot; ixyz++) {
       for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
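With the fill loop on the CPU, the target data region moves after it and global_vec_w is now mapped with to: instead of alloc:, so the host-prepared contents are copied to the device; alloc: would only reserve uninitialized device memory. The device-side zeroing of vec_wsum ("Kernel 2") is dropped for the same reason. A minimal sketch of the map-clause distinction (w and s are illustrative names):

#include <iostream>

int main() {
  const int n = 4;
  double w[n] = {1.0, 2.0, 3.0, 4.0}; // prepared on the host
  double s = 0.0;
  // map(to:) copies the host contents to the device; map(alloc:) would only
  // reserve device memory and leave it uninitialized.
#pragma omp target teams distribute parallel for map(to: w[0:n]) reduction(+: s)
  for (int i = 0; i < n; i++) s += w[i];
  std::cout << s << std::endl; // prints 10
  return 0;
}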