Merge branch 'master' into script_devel updating to GPU optimized trapping (03adc9d6) · Commits · Giacomo Mulas / NP_TMcode

build/configure.sh

+6 −6

Original line number	Diff line number	Diff line
		@@ -364,8 +364,8 @@ result=$?
		if [ "x$result" = "x0" ]; then
		CLANGFLAGS=" -stdlib=libstdc++"
		fi
		echo -n "configure: checking wether $CXX works... "
		echo -n "configure: checking wether $CXX works... " >>configure.log
		echo -n "configure: checking whether $CXX works... "
		echo -n "configure: checking whether $CXX works... " >>configure.log
		cat > test_compiler.cpp <<EOF
		int main() {
		int i = -1;
		@@ -385,8 +385,8 @@ else
		echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log
		exit 2
		fi
		echo -n "configure: checking wether $CXX supports -ggdb... "
		echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log
		echo -n "configure: checking whether $CXX supports -ggdb... "
		echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log
		$CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log
		result=$?
		if [ "x$result" = "x0" ]; then
		@@ -410,8 +410,8 @@ else
		echo "no"
		echo "no" >>configure.log
		fi
		echo -n "configure: checking wether $CXX is a MPI compiler... "
		echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log
		echo -n "configure: checking whether $CXX is a MPI compiler... "
		echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log
		cat > test_compiler.cpp <<EOF
		# include <mpi.h>
		int main() {

src/include/tfrfme.h

+20 −30

Original line number	Diff line number	Diff line
		@@ -28,7 +28,7 @@ class Swap1 {
		protected:
		//! Index of the last element to be filled.
		int _last_index;
		//! Number of vector coordinates. QUESTION: correct?
		//! Number of beam description wave-numbers.
		int _nkv;
		//! NLMMT = 2 * LM * (LM + 2)
		int _nlmmt;
		@@ -69,7 +69,7 @@ public:
		/*! \brief Swap1 instance constructor.
		*
		* \param lm: `int` Maximum field expansion order.
		* \param _nkv: `int` Number of vector coordinates. QUESTION: correct?
		* \param nkv: `int` Number of beam description wave numbers.
		*/
		Swap1(int lm, int nkv);

		@@ -130,10 +130,8 @@ protected:
		int _last_vector;
		//! Index of the last matrix element to be filled.
		int _last_matrix;
		//! Number of vector coordinates. QUESTION: correct?
		//! Number of beam description wave numbers.
		int _nkv;
		//! Contiguous vector of VKZM matrix.
		double *vec_vkzm;
		//! QUESTION: definition?
		double _apfafa;
		//! QUESTION: definition?
		@@ -152,13 +150,13 @@ protected:
		double _delxyz;
		//! QUESTION: definition?
		double _vknmx;
		//! QUESTION: definition?
		//! Wave number grid spacing.
		double _delk;
		//! QUESTION: definition?
		//! Square of wave number grid spacing.
		double _delks;
		//! NLMMT = LM * (LM + 2) * 2
		int _nlmmt;
		//! Number of radial vector coordinates. QUESTION: correct?
		//! Number of radial vector coordinates.
		int _nrvc;

		/*! \brief Load a Swap2 instance from a HDF5 binary file.
		@@ -192,12 +190,12 @@ public:
		const int &last_vector = _last_vector;
		//! Read-only view on the index of the last matrix element to be filled.
		const int &last_matrix = _last_matrix;
		//! Read-only view on the number of vector coordinates. QUESTION: correct?
		//! Read-only view on the number of beam description wave numbers.
		const int &nkv = _nkv;
		//! QUESTION: definition?
		double *vkv;
		//! QUESTION: definition?
		double **vkzm;
		double *vec_vkzm;
		//! QUESTION: definition?
		const double &apfafa = _apfafa;
		//! QUESTION: definition?
		@@ -222,12 +220,12 @@ public:
		const double &delks = _delks;
		//! NLMMT = LM * (LM + 2) * 2
		const int &nlmmt = _nlmmt;
		//! Number of radial vector coordinates. QUESTION: correct?
		//! Read-only view on the number of radial vector coordinates.
		const int &nrvc = _nrvc;

		/*! \brief Swap2 instance constructor.
		*
		* \param nkv: `int` Number of vector coordinates. QUESTION: correct?
		* \param nkv: `int` Number of beam description wave numbers.
		*/
		Swap2(int nkv);

		@@ -244,15 +242,9 @@ public:
		*/
		static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY");

		/*! \brief Get the pointer to the VKZM matrix.
		*
		* \return value: `double **` Pointer to the VKZM matrix.
		*/
		double **get_matrix() { return vkzm; }

		/*! \brief Calculate the necessary amount of memory to create a new instance.
		*
		* \param nkv: `int` Number of radial vector coordinates. QUESTION: correct?
		* \param nkv: `int` Number of beam description wave numbers.
		* \return size: `long` The necessary memory size in bytes.
		*/
		static long get_size(int nkv);
		@@ -316,11 +308,11 @@ protected:
		int _nlmmt;
		//! NRVC = NXV * NYV * NZV
		int _nrvc;
		//! Field expansion mode identifier.
		//! Beam description mode.
		int _lmode;
		//! Maximum field expansion order.
		int _lm;
		//! QUESTION: definition?
		//! Number of beam description wave numbers.
		int _nkv;
		//! Number of computed X coordinates.
		int _nxv;
		@@ -332,11 +324,11 @@ protected:
		double _vk;
		//! External medium refractive index
		double _exri;
		//! QUESTION: definition?
		//! Numerical aperture.
		double _an;
		//! QUESTION: definition?
		//! Filling factor.
		double _ff;
		//! QUESTION: definition?
		//! Lens transmission.
		double _tra;
		//! QUESTION: definition?
		double _spd;
		@@ -350,8 +342,6 @@ protected:
		double *yv;
		//! Vector of computed z positions
		double *zv;
		//! QUESTION: definition?
		dcomplex *vec_wsum;

		/*! \brief Load a configuration instance from a HDF5 binary file.
		*
		@@ -402,11 +392,11 @@ public:
		const double& vk = _vk;
		//! Read-only view on external medium refractive index
		const double& exri = _exri;
		//! QUESTION: definition?
		//! Read-only view on numeric aperture.
		const double& an = _an;
		//! QUESTION: definition?
		//! Read-only view on filling factor.
		const double& ff = _ff;
		//! QUESTION: definition?
		//! Read-only view on lens transmission.
		const double& tra = _tra;
		//! QUESTION: definition?
		const double& spd = _spd;
		@@ -415,7 +405,7 @@ public:
		//! QUESTION: definition?
		const double& exril = _exril;
		//! QUESTION: definition?
		dcomplex **wsum;
		dcomplex *vec_wsum;

		/*! \brief Trapping configuration instance constructor.
		*

src/libnptm/Configuration.cpp

+4 −2

Original line number	Diff line number	Diff line
		@@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil
		// Read optional configuration data used only by the C++ code.
		while (num_lines > last_read_line) {
		str_target = file_lines[last_read_line++];
		if (str_target.size() > 0) {
		if (str_target.size() > 15) {
		if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) {
		regex_search(str_target, m, re);
		short refine_flag = (short)stoi(m.str());
		conf->_refine_flag = refine_flag;
		}
		else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) {
		}
		if (str_target.size() > 14) {
		if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) {
		regex_search(str_target, m, re);
		short dyn_order_flag = (short)stoi(m.str());
		conf->_dyn_order_flag = dyn_order_flag;

src/libnptm/clu_subs.cpp

+115 −102

Original line number	Diff line number	Diff line
		@@ -47,10 +47,6 @@
		#include <omp.h>
		#endif

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp requires unified_shared_memory
		#endif

		using namespace std;

		void apc(
		@@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) {
		return result;
		}

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		double cgev(int ipamo, int mu, int l, int m) {
		double result = 0.0;
		double xd = 0.0, xn = 0.0;
		@@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) {
		}
		return result;
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		void cms(dcomplex *am, ParticleDescriptor c1) {
		dcomplex dm, de, cgh, cgk;
		@@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) {
		delete[] svs;
		}

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		dcomplex ghit_d(
		int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
		ParticleDescriptor c1, double rac3j
		@@ -862,13 +858,13 @@ dcomplex ghit_d(
		}
		return result;
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		dcomplex ghit(
		int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
		ParticleDescriptor *c1
		@@ -1079,9 +1075,9 @@ dcomplex ghit(
		}
		return result;
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		void hjv(
		double exri, double vk, int &jer, int &lcalc, dcomplex &arg,
		@@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
		#ifdef USE_NVTX
		nvtxRangePush("pcros intermediate loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		// #else
		// #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		// #endif
		#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		#endif
		for (int i12 = 0; i12 < nlemt; i12++) {
		// int i = i12 - 1;
		dcomplex am = cc0;
		@@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
		csam = -(ccs / (exri * vk)) * 0.5 * I;
		sum2 = cc0;
		sum3 = cc0;
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
		// #else
		// #pragma omp parallel for simd reduction(+:sum2,sum3)
		// #endif
		#pragma omp parallel for simd reduction(+:sum2,sum3)
		#endif
		for (int i14 = 0; i14 < c1->nlem; i14++) {
		int ie = i14 + c1->nlem;
		sum2 += (vec_am0m[nlemti14 + i14] + vec_am0m[nlemtie + ie]);
		@@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
		} // i14 loop
		double sumpi = 0.0;
		dcomplex sumpd = cc0;
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		// #else
		// #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		// #endif
		#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		#endif
		for (int i16 = 0; i16 < nlemt; i16++) {
		for (int j16 = 0; j16 < c1->nlem; j16++) {
		int je = j16 + c1->nlem;
		@@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) {
		}
		}

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
		int jmx = j3 + j2;
		int jdf = j3 - j2;
		@@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
		}
		}
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
		int jmx = j3 + j2;
		int jdf = j3 - j2;
		@@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
		}
		}
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) {
		int mmx = (j2 < j3 - m1) ? j2 : j3 - m1;
		@@ -2005,11 +2004,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba inner loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
		// #else
		// #pragma omp parallel for simd reduction(+:c1, c2)
		// #endif
		#pragma omp parallel for simd reduction(+:c1, c2)
		#endif
		for (int j10 = 1; j10 <= nlemt; j10++) {
		int j = j10 - 1;
		c1 += (vec_am0m[inlemt+j] vec_w[4*j]);
		@@ -2027,11 +2027,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba outer loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp teams distribute parallel for
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp teams distribute parallel for
		// #else
		// #pragma omp parallel for
		// #endif
		#pragma omp parallel for
		#endif
		for (int ipo = 0; ipo < 2; ipo++) {
		int jpo = 1 - ipo;
		ctqce[ipo][0] = cc0;
		@@ -2063,11 +2064,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba inner loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		// #else
		// #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		// #endif
		#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		#endif
		for (int k = 1; k<=kmax; k++) {
		int l60 = (int) sqrt(k+1);
		int im60 = k - (l60*l60) + 1;
		@@ -2140,11 +2142,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba loop 3");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd
		// #else
		// #pragma omp parallel for simd
		// #endif
		#pragma omp parallel for simd
		#endif
		for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
		int ipo = ipo78 - 1;
		tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;
		@@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
		#ifdef USE_NVTX
		nvtxRangePush("scr0 inner loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
		// #else
		// #pragma omp parallel for simd reduction(+:sums, sum21)
		// #endif
		#pragma omp parallel for simd reduction(+:sums, sum21)
		#endif
		for (int l10 = 1; l10 <= c1->li; l10++) {
		double fl = 1.0 * (l10 + l10 + 1);
		// dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1];
		@@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
		#ifdef USE_NVTX
		nvtxRangePush("scr0 loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
		// #else
		// #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
		// #endif
		#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
		#endif
		for (int i14 = 1; i14 <= c1->nsph; i14++) {
		int iogi = c1->iog[i14 - 1];
		scs += c1->sscs[iogi - 1];
		@@ -2328,11 +2333,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 inner loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
		// #else
		// #pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
		// #endif
		#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
		#endif
		for (int k = 1; k<=kmax; k++) {
		int l10 = (int) sqrt(k+1);
		int im10 = k - (l10*l10) + 1;
		@@ -2384,11 +2390,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		// #else
		// #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		// #endif
		#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		#endif
		for (int i14 = 1; i14 <= c1->nsph; i14++) {
		int i = i14 - 1;
		int iogi = c1->iog[i14 - 1];
		@@ -2418,11 +2425,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 inner loop 3");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(4)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(4)
		// #else
		// #pragma omp parallel for simd collapse(4)
		// #endif
		#pragma omp parallel for simd collapse(4)
		#endif
		for (int ipo1 = 1; ipo1 <=2; ipo1++) {
		for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
		for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
		@@ -2445,11 +2453,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 loop 4");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for collapse(4)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for collapse(4)
		// #else
		// #pragma omp parallel for collapse(4)
		// #endif
		#pragma omp parallel for collapse(4)
		#endif
		for (int ipo1 = 1; ipo1 <=2; ipo1++) {
		for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
		for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
		@@ -2582,11 +2591,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		// but if it results im = 0, then we set l = l-1 and im = 2*l+1
		// furthermore if it results im > 2*l+1, then we set
		// im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(3)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(3)
		// #else
		// #pragma omp parallel for simd collapse(3)
		// #endif
		#pragma omp parallel for simd collapse(3)
		#endif
		for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
		for (int k2 = 1; k2<=k2max; k2++) {
		for (int k3 = 1; k3<=k3max; k3++) {
		@@ -2632,11 +2642,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		#endif
		dcomplex *am_v = am[0];
		dcomplex *sam_v = c1->sam[0];
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2)
		// #else
		// #pragma omp parallel for simd collapse(2)
		// #endif
		#pragma omp parallel for simd collapse(2)
		#endif
		for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
		for (int i3 = 1; i3 <= c1->nlem; i3++) {
		dcomplex sum1 = cc0;
		@@ -2669,11 +2680,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		sam_v[vecind1e + i3e - 1] = sum4;
		} // i3 loop
		} // i1 loop
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2)
		// #else
		// #pragma omp parallel for simd collapse(2)
		// #endif
		#pragma omp parallel for simd collapse(2)
		#endif
		for (int i1 = 1; i1 <= c1->ndi; i1++) {
		for (int i0 = 1; i0 <= c1->nlem; i0++) {
		int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
		@@ -2682,11 +2694,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		} // i0 loop
		} // i1 loop
		dcomplex *vec_am0m = c1->am0m[0];
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2)
		// #else
		// #pragma omp parallel for simd collapse(2)
		// #endif
		#pragma omp parallel for simd collapse(2)
		#endif
		for (int i0 = 1; i0 <= c1->nlem; i0++) {
		for (int i3 = 1; i3 <= c1->nlemt; i3++) {
		int i0e = i0 + c1->nlem;

src/libnptm/tfrfme.cpp

+54 −23

Original line number	Diff line number	Diff line
		@@ -44,6 +44,10 @@
		#include "../include/file_io.h"
		#endif

		#ifdef USE_TARGET_OFFLOAD
		#include <cstdlib>
		#endif

		using namespace std;

		// >>> START OF Swap1 CLASS IMPLEMENTATION <<<
		@@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) {
		// >>> START OF Swap2 CLASS IMPLEMENTATION <<<
		Swap2::Swap2(int nkv) {
		_nkv = nkv;
		#ifdef USE_TARGET_OFFLOAD
		vkv = (double )aligned_alloc(64, _nkv sizeof(double));
		vec_vkzm = (double )aligned_alloc(64, _nkv _nkv * sizeof(double));
		#pragma omp parallel for collapse(2)
		for (int i = 0; i < _nkv; i++) {
		for (int j = 0; j < _nkv; j++) {
		vkv[i] = 0.0;
		vec_vkzm[_nkv * i +j] = 0.0;
		}
		}
		#else
		vkv = new double[_nkv]();
		vec_vkzm = new double[_nkv * _nkv]();
		vkzm = new double*[nkv];
		for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv;
		#endif // USE TARGET_OFFLOAD
		_last_vector = 0;
		_last_matrix = 0;
		}

		Swap2::~Swap2() {
		#ifdef USE_TARGET_OFFLOAD
		free(vkv);
		free(vec_vkzm);
		#else
		delete[] vkv;
		delete[] vec_vkzm;
		delete[] vkzm;
		#endif // USE_TARGET_OFFLOAD
		}

		Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) {
		@@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) {
		fstream input;
		Swap2 *instance = NULL;
		int fnkv, fnlmmt, fnrvc;
		double **fvkzm = NULL;
		double *fvkzm = NULL;
		double *fvkv = NULL;
		double value;
		input.open(file_name.c_str(), ios::in \| ios::binary);
		if (input.is_open()) {
		input.read(reinterpret_cast<char *>(&fnkv), sizeof(int));
		instance = new Swap2(fnkv);
		fvkzm = instance->get_matrix();
		fvkzm = instance->vec_vkzm;
		fvkv = instance->get_vector();
		for (int vj = 0; vj < fnkv; vj++) {
		input.read(reinterpret_cast<char *>(&value), sizeof(double));
		@@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) {
		for (int mi = 0; mi < fnkv; mi++) {
		for (int mj = 0; mj < fnkv; mj++) {
		input.read(reinterpret_cast<char *>(&value), sizeof(double));
		fvkzm[mi][mj] = value;
		fvkzm[fnkv * mi + mj] = value;
		}
		}
		input.read(reinterpret_cast<char *>(&value), sizeof(double));
		@@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) {
		void Swap2::push_matrix(double value) {
		int col = _last_matrix % (_nkv - 1);
		int row = _last_matrix - (_nkv * row);
		vkzm[row][col] = value;
		vec_vkzm[nkv * row + col] = value;
		_last_matrix++;
		}

		@@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) {
		}
		for (int mi = 0; mi < _nkv; mi++) {
		for (int mj = 0; mj < _nkv; mj++) {
		value = vkzm[mi][mj];
		value = vec_vkzm[nkv * mi + mj];
		output.write(reinterpret_cast<const char*>(&value), sizeof(double));
		}
		}
		@@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) {
		}
		}
		for (int mi = 0; mi < _nkv; mi++) {
		int nkvi = nkv * mi;
		for (int mj = 0; mj < _nkv; mj++) {
		if (vkzm[mi][mj] != other.vkzm[mi][mj]) {
		if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) {
		return false;
		}
		}
		@@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) {
		_exril = 0.0;

		// Array initialization
		xv = new double[nxv]();
		yv = new double[nyv]();
		zv = new double[nzv]();
		_nlmmt = _lm * (_lm + 2) * 2;
		_nrvc = _nxv * _nyv * _nzv;
		vec_wsum = new dcomplex[nrvc * nlmmt]();
		wsum = new dcomplex*[nlmmt];
		for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc;
		#ifdef USE_TARGET_OFFLOAD
		xv = (double )aligned_alloc(64, sizeof(double) _nxv);
		yv = (double )aligned_alloc(64, sizeof(double) _nyv);
		zv = (double )aligned_alloc(64, sizeof(double) _nzv);
		vec_wsum = (dcomplex )aligned_alloc(64, sizeof(dcomplex) _nrvc * _nlmmt);
		#else
		xv = new double[_nxv]();
		yv = new double[_nyv]();
		zv = new double[_nzv]();
		vec_wsum = new dcomplex[_nrvc * _nlmmt]();
		#endif // USE_TARGET_OFFLOAD
		}

		TFRFME::~TFRFME() {
		#ifdef USE_TARGET_OFFLOAD
		free(xv);
		free(yv);
		free(zv);
		free(vec_wsum);
		#else
		delete[] xv;
		delete[] yv;
		delete[] zv;
		delete[] vec_wsum;
		delete[] wsum;
		#endif
		}

		TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) {
		@@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) {
		for (int wj = 0; wj < nrvc; wj++) {
		for (int wi = 0; wi < nlmmt; wi++) {
		value = elements[index] + elements[index + 1] * I;
		instance->wsum[wi][wj] = value;
		instance->vec_wsum[nrvc * wi + wj] = value;
		index += 2;
		} // wi loop
		} // wj loop
		@@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) {
		input.read(reinterpret_cast<char *>(&rval), sizeof(double));
		input.read(reinterpret_cast<char *>(&ival), sizeof(double));
		dcomplex value = rval + ival * I;
		instance->wsum[wi][wj] = value;
		instance->vec_wsum[nrvc * wi + wj] = value;
		} // wi loop
		} // wj loop
		input.close();
		@@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) {
		int index = 0;
		for (int wj = 0; wj < nrvc; wj++) {
		for (int wi = 0; wi < nlmmt; wi++) {
		ptr_elements[index++] = real(wsum[wi][wj]);
		ptr_elements[index++] = imag(wsum[wi][wj]);
		ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]);
		ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]);
		} // wi loop
		} // wj loop
		rec_ptr_list.append(ptr_elements);
		@@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) {
		output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double));
		for (int wj = 0; wj < _nrvc; wj++) {
		for (int wi = 0; wi < _nlmmt; wi++) {
		double rval = real(wsum[wi][wj]);
		double ival = imag(wsum[wi][wj]);
		double rval = real(vec_wsum[nrvc * wi + wj]);
		double ival = imag(vec_wsum[nrvc * wi + wj]);
		output.write(reinterpret_cast<char *>(&rval), sizeof(double));
		output.write(reinterpret_cast<char *>(&ival), sizeof(double));
		} // wi loop
		@@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) {
		}
		}
		for (int wi = 0; wi < _nlmmt; wi++) {
		int i = _nrvc * wi;
		for (int wj = 0; wj < _nrvc; wj++) {
		if (wsum[wi][wj] != other.wsum[wi][wj]) {
		if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) {
		return false;
		}
		} // wj loop