Flatten j80 loop and add logging on number of J iterations (a4af2237) · Commits · Giacomo Mulas / NP_TMcode

src/trapping/cfrfme.cpp

+25 −26

Original line number	Diff line number	Diff line
		@@ -467,6 +467,8 @@ void frfme(string data_file, string output_path) {
		dcomplex *vec_wsum = tfrfme->wsum[0];
		double *vec_vkzm = vkzm[0];
		dcomplex *global_vec_w = new dcomplex[size_global_vec_w];
		message = "INFO: looping over " + to_string(jlml - jlmf + 1) + " J iterations.\n";
		logger.log(message);
		#ifdef USE_TARGET_OFFLOAD
		t_end = chrono::high_resolution_clock::now();
		elapsed = t_start - t_end;
		@@ -664,10 +666,16 @@ void offload_loop(
		} // jxy50 loop
		}

		// Kernel 2: run the calculation
		#pragma omp target teams distribute parallel for collapse(2)
		for (int j80 = jlmf - 1; j80 < jlml; j80++) {
		// Kernel 2: ensure that vec_wsum is initialized at 0
		#pragma omp target teams distribute parallel for
		for (long i = 0; i < size_vec_wsum; i++)
		vec_wsum[i] = cc0;

		// Kernel 3: run the calculation
		#pragma omp target teams distribute parallel for collapse(3)
		for (int ixyz = 0; ixyz < nvtot; ixyz++) {
		for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
		for (int j80 = jlmf - 1; j80 < jlml; j80++) {
		int j80_index = j80 - jlmf + 1;
		dcomplex vec_w = global_vec_w + nkvs j80_index;
		int iz75 = ixyz / nvxy;
		@@ -676,15 +684,12 @@ void offload_loop(
		double z = _zv[iz75] + frsh;
		double y = _yv[iy70];
		double x = _xv[ix65];
		double rsumy = 0.0;
		double isumy = 0.0;
		#pragma omp parallel for reduction(+:rsumy, isumy)
		for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
		int jy60 = jy60x55 / nkv;
		int jx55 = jy60x55 % nkv;
		int w_index = (jx55 * nkv) + jy60;
		double vky = vkv[jy60];
		double factor = (jy60 == 0 \|\| jy60 == nkvmo) ? 0.5 : 1.0;
		long wsum_index = (j80_index * nvtot) + ixyz;
		if (jx55 == 0) {
		// jx55 = 0: phasf
		double vkx = vkv[nkvmo];
		@@ -695,8 +700,7 @@ void offload_loop(
		dcomplex phasf = c + uim * s;
		dcomplex term = vec_w[jy60] * phasf * 0.5;
		term *= factor;
		rsumy += (real(term));
		isumy += (imag(term));
		vec_wsum[wsum_index] += term * delks;
		} else if (jx55 == nkvmo) {
		// jx55 = nkv - 1: phasl
		double vkx = vkv[nkvmo];
		@@ -707,8 +711,7 @@ void offload_loop(
		dcomplex phasl = c + uim * s;
		dcomplex term = vec_w[nkvvmo + jy60] * phasl * 0.5;
		term *= factor;
		rsumy += (real(term));
		isumy += (imag(term));
		vec_wsum[wsum_index] += term * delks;
		} else {
		// 1 <= jx55 < nkv - 1
		double vkx = vkv[jx55];
		@@ -719,15 +722,11 @@ void offload_loop(
		dcomplex phas = c + uim * s;
		dcomplex term = vec_w[w_index] * phas;
		term *= factor;
		rsumy += (real(term));
		isumy += (imag(term));
		vec_wsum[wsum_index] += term * delks;
		}
		} // jy60x55 loop
		dcomplex sumy = rsumy + uim * isumy;
		vec_wsum[(j80_index * nvtot) + ixyz] = sumy * delks;
		} // ixyz loop
		} // j80 loop

		} // target region
		}
		#endif // USE TARGET_OFFLOAD