Commit 512cf50a authored by Giovanni La Mura
Browse files

Create a global host work-space to allocate thread-private device work-spaces

parent 938cbc92
Loading
Loading
Loading
Loading
+12 −5
Original line number Diff line number Diff line
@@ -401,15 +401,22 @@ void frfme(string data_file, string output_path) {
	  int size_vkzm = nkv * nkv;
	  const dcomplex *vec_tt1_wk = tt1->wk;
	  int size_tt1_wk = nkv * nkv * nlmmt;
	  dcomplex *global_vec_w = new dcomplex[nkv * nkv * (jlml - jlmf + 1)]();
	  dcomplex **global_w = new dcomplex*[nkv * (jlml - jlmf + 1)];
	  int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1);
	  int size_global_w = nkv * (jlml - jlmf + 1);
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv], jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks)
#pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \
  map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(alloc:global_w[0:size_global_w], global_vec_w[0:size_global_vec_w]) \
  firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks)
#else
#pragma omp parallel for simd
#endif
	  for (int j80 = jlmf-1; j80 < jlml; j80++) {
	    int nkvs = nkv * nkv;
	    dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex));
	    dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *));
	    dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1);
	    dcomplex **w = global_w + nkv * (j80 - jlmf + 1);
	    for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
	    dcomplex wk_value;
	    int wk_index = 0;
@@ -449,9 +456,9 @@ void frfme(string data_file, string output_path) {
	      } // jy60x55 loop
	      vec_wsum[((j80) * nrvc) + ixyz] = sumy * delks;
	    } // ixyz loop
	    free(vec_w);
	    free(w);
	  } // j80 loop
	  delete[] global_w;
	  delete[] global_vec_w;
#ifdef USE_NVTX
	  nvtxRangePop();
	  nvtxRangePush("Closing operations");