Commit 704e038d authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

linearise loops in cfrfme.cpp, to prepare for omp target offload

parent 4a8cfbd6
Loading
Loading
Loading
Loading
+80 −44
Original line number Original line Diff line number Diff line
@@ -61,6 +61,14 @@
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExt.h>
#endif
#endif


#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef USE_TARGET_OFFLOAD
#pragma omp requires unified_shared_memory
#endif

using namespace std;
using namespace std;


/*! \brief C++ implementation of FRFME
/*! \brief C++ implementation of FRFME
@@ -391,57 +399,84 @@ void frfme(string data_file, string output_path) {
#ifdef USE_NVTX
#ifdef USE_NVTX
	  nvtxRangePush("j80 loop");
	  nvtxRangePush("j80 loop");
#endif
#endif
#pragma omp parallel for
	  dcomplex *vec_wsum = tfrfme->wsum[0];
	  for (int j80 = jlmf; j80 <= jlml; j80++) {
	  double *vec_vkzm = vkzm[0];
	    dcomplex *vec_w = new dcomplex[nkv * nkv]();
#ifdef USE_TARGET_OFFLOAD
	    dcomplex **w = new dcomplex*[nkv];
#pragma omp target teams distribute parallel for simd
#endif
// #pragma omp parallel for
	  for (int j80 = jlmf-1; j80 < jlml; j80++) {
	    int nkvs = nkv * nkv;
	    dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex));
	    dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *));
	    // dcomplex *wk_local = new dcomplex[nlmmt]();
	    // dcomplex *wk_local = new dcomplex[nlmmt]();
	    for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
	    for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
	    dcomplex wk_value;
	    dcomplex wk_value;
	    int wk_index = 0;
	    int wk_index = 0;
	    int nkvs = nkv * nkv;
	    // for (int jy50 = 0; jy50 < nkv; jy50++) {
	    for (int jy50 = 0; jy50 < nkv; jy50++) {
	    //   for (int jx50 = 0; jx50 < nkv; jx50++) {
	      for (int jx50 = 0; jx50 < nkv; jx50++) {
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for simd
// #endif
// #pragma omp parallel for
	    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
		// for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++];
		// for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++];
		// w[jx50][jy50] = wk_local[j80 - 1];
		// w[jx50][jy50] = wk_local[j80];
		wk_value = tt1->wk[wk_index + j80 - 1];
		wk_index = nlmmt*jxy50;
		wk_index += nlmmt;
		wk_value = tt1->wk[wk_index + j80];
		w[jx50][jy50] = wk_value;
		// wk_index += nlmmt;
	      } // jx50
		int jy50 = jxy50 / nkv;
	    } // jy50 loop
		int jx50 = jxy50 % nkv;
		vec_w[(nkv*jx50) + jy50] =  wk_value;
		// w[jx50][jy50] = wk_value;
	    } // jxy50 loop
	    //   } // jx50
	    // } // jy50 loop
	    int ixyz = 0;
	    int ixyz = 0;
	    for (int wj = 0; wj < nrvc; wj++) tfrfme->wsum[j80 - 1][wj] = cc0;
	    for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0;
	    for (int iz75 = 0; iz75 < nzv; iz75++) {
	    int nvtot = nxv*nyv*nzv;
	    int nvxy = nxv *nyv;
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for
// #endif
// #pragma omp parallel for
	    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
	      int iz75 = ixyz / nvxy;
	      int iy70 = (ixyz % nvxy) / nxv;
	      int ix65 = ixyz % nxv;
	    // for (int iz75 = 0; iz75 < nzv; iz75++) {
	      double z = _zv[iz75] + frsh;
	      double z = _zv[iz75] + frsh;
	      for (int iy70 = 0; iy70 < nyv; iy70++) {
	      // for (int iy70 = 0; iy70 < nyv; iy70++) {
	      double y = _yv[iy70];
	      double y = _yv[iy70];
		for (int ix65 = 0; ix65 < nxv; ix65++) {
		// for (int ix65 = 0; ix65 < nxv; ix65++) {
	      double x = _xv[ix65];
	      double x = _xv[ix65];
		  ixyz++;
		  // ixyz++;
	      dcomplex sumy = cc0;
	      dcomplex sumy = cc0;
		  for (int jy60 = 0; jy60 < nkv; jy60++) {
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target parallel for simd reduction(+:sumy)
// #endif
	      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
		int jy60 = jy60x55 / nkv;
		int jx55 = jy60x55 % nkv;
		double vky = vkv[jy60];
		double vky = vkv[jy60];
		    double vkx = vkv[nkv - 1];
		double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55];
		    double vkzf = vkzm[0][jy60];
		double vkz = vec_vkzm[(jx55*nkv)+jy60];
		    dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z));
		dcomplex phas = (jx55==0) ?
		    double vkzl = vkzm[nkv - 1][jy60];
		  cexp(uim * (-vkx * x + vky * y + vkz * z)):
		    dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z));
		  cexp(uim * (vkx * x + vky * y + vkz * z));
		    dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl);
		dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas;
		    for (int jx55 = 2; jx55 <= nks; jx55++) {
		double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0;
		      vkx = vkv[jx55 - 1];
		double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0;
		      double vkz = vkzm[jx55 - 1][jy60];
		sumx *= factor1*factor2;
		      dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z));
		      sumx += (w[jx55 - 1][jy60] * phas);
		    } // jx55 loop
		    if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5;
		sumy += sumx;
		sumy += sumx;
		  } // jy60 loop
	      } // jy60x55 loop
		  tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks;
	      vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks;
		} // ix65 loop
	    // 	} // ix65 loop
	      } // iy70 loop
	    //   } // iy70 loop
	    } // iz75 loop
	    // } // iz75 loop
	    delete[] vec_w;
	    } // ixyz loop
	    delete[] w;
	    free(vec_w);
	    free(w);
	    // delete[] wk_local;
	    // delete[] wk_local;
	  } // j80 loop
	  } // j80 loop
#ifdef USE_NVTX
#ifdef USE_NVTX
@@ -499,3 +534,4 @@ void frfme(string data_file, string output_path) {
  nvtxRangePop();
  nvtxRangePop();
#endif
#endif
}
}