Commit e279e64d authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Prepare nested OpenMP parallelism in TRAPPING

parent 7dd24b46
Loading
Loading
Loading
Loading
+23 −21
Original line number Diff line number Diff line
@@ -398,7 +398,8 @@ void frfme(string data_file, string output_path) {
	  dcomplex *vec_wsum = tfrfme->wsum[0];
	  int size_wsum = nlmmt * nrvc;
	  double *vec_vkzm = vkzm[0];
	  int size_vkzm = nkv * nkv;
	  int nkvs = nkv * nkv;
	  int size_vkzm = nkvs;
	  const dcomplex *vec_tt1_wk = tt1->wk;
	  int size_tt1_wk = nkv * nkv * nlmmt;
	  int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1);
@@ -412,34 +413,34 @@ void frfme(string data_file, string output_path) {
	  device_id = omp_get_default_device();
	  global_vec_w = (dcomplex *)omp_target_alloc(size_global_vec_w * sizeof(dcomplex), device_id);
	  global_w = (dcomplex **)omp_target_alloc(size_global_w * sizeof(dcomplex), device_id);
#pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \
  map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
  map(to: global_vec_w, global_w) \
  firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks)
#pragma omp target teams distribute parallel for \
  map(tofrom: vec_wsum[0:size_wsum]) \
  map(to: vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv]) \
  map(to: _yv[0:nyv], _zv[0:nzv], global_vec_w, global_w)
#else
	  // Fall-back host work-space allocation
	  global_vec_w =  = new dcomplex[size_global_vec_w]();
	  global_w  = new dcomplex*[size_global_w];
#pragma omp parallel for simd
#endif
#pragma omp parallel for
#endif // USE_TARGET_OFFLOAD
	  for (int j80 = jlmf - 1; j80 < jlml; j80++) {
	    int nkvs = nkv * nkv;
	    dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1);
	    dcomplex **w = global_w + nkv * (j80 - jlmf + 1);
#pragma omp parallel for simd
	    for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
	    dcomplex wk_value;
	    int wk_index = 0;
#pragma omp parallel for simd
	    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
		wk_index = nlmmt * jxy50;
		wk_value = vec_tt1_wk[wk_index + j80];
		int wk_index = nlmmt * jxy50;
		dcomplex wk_value = vec_tt1_wk[wk_index + j80];
		int jy50 = jxy50 / nkv;
		int jx50 = jxy50 % nkv;
		vec_w[(nkv * jx50) + jy50] =  wk_value;
	    } // jxy50 loop
	    int ixyz = 0;
#pragma omp parallel for simd
	    for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0;
	    int nvtot = nxv * nyv * nzv;
	    int nvxy = nxv * nyv;
#pragma omp parallel for
	    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
	      int iz75 = ixyz / nvxy;
	      int iy70 = (ixyz % nvxy) / nxv;
@@ -448,6 +449,7 @@ void frfme(string data_file, string output_path) {
	      double y = _yv[iy70];
	      double x = _xv[ix65];
	      dcomplex sumy = cc0;
#pragma omp parallel for simd
	      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
		int jy60 = jy60x55 / nkv;
		int jx55 = jy60x55 % nkv;