Merge branch 'master' into script_devel updating to GPU optimized trapping (03adc9d6) · Commits · Giacomo Mulas / NP_TMcode

build/configure.sh

+6 −6

Original line number	Original line	Diff line number	Diff line
	@@ -364,8 +364,8 @@ result=$?
	if [ "x$result" = "x0" ]; then		if [ "x$result" = "x0" ]; then
	CLANGFLAGS=" -stdlib=libstdc++"		CLANGFLAGS=" -stdlib=libstdc++"
	fi		fi
	echo -n "configure: checking wether $CXX works... "		echo -n "configure: checking whether $CXX works... "
	echo -n "configure: checking wether $CXX works... " >>configure.log		echo -n "configure: checking whether $CXX works... " >>configure.log
	cat > test_compiler.cpp <<EOF		cat > test_compiler.cpp <<EOF
	int main() {		int main() {
	int i = -1;		int i = -1;
	@@ -385,8 +385,8 @@ else
	echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log		echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log
	exit 2		exit 2
	fi		fi
	echo -n "configure: checking wether $CXX supports -ggdb... "		echo -n "configure: checking whether $CXX supports -ggdb... "
	echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log		echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log
	$CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log		$CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log
	result=$?		result=$?
	if [ "x$result" = "x0" ]; then		if [ "x$result" = "x0" ]; then
	@@ -410,8 +410,8 @@ else
	echo "no"		echo "no"
	echo "no" >>configure.log		echo "no" >>configure.log
	fi		fi
	echo -n "configure: checking wether $CXX is a MPI compiler... "		echo -n "configure: checking whether $CXX is a MPI compiler... "
	echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log		echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log
	cat > test_compiler.cpp <<EOF		cat > test_compiler.cpp <<EOF
	# include <mpi.h>		# include <mpi.h>
	int main() {		int main() {

src/include/tfrfme.h

+20 −30

Original line number	Original line	Diff line number	Diff line
	@@ -28,7 +28,7 @@ class Swap1 {
	protected:		protected:
	//! Index of the last element to be filled.		//! Index of the last element to be filled.
	int _last_index;		int _last_index;
	//! Number of vector coordinates. QUESTION: correct?		//! Number of beam description wave-numbers.
	int _nkv;		int _nkv;
	//! NLMMT = 2 * LM * (LM + 2)		//! NLMMT = 2 * LM * (LM + 2)
	int _nlmmt;		int _nlmmt;
	@@ -69,7 +69,7 @@ public:
	/*! \brief Swap1 instance constructor.		/*! \brief Swap1 instance constructor.
	*		*
	* \param lm: `int` Maximum field expansion order.		* \param lm: `int` Maximum field expansion order.
	* \param _nkv: `int` Number of vector coordinates. QUESTION: correct?		* \param nkv: `int` Number of beam description wave numbers.
	*/		*/
	Swap1(int lm, int nkv);		Swap1(int lm, int nkv);

	@@ -130,10 +130,8 @@ protected:
	int _last_vector;		int _last_vector;
	//! Index of the last matrix element to be filled.		//! Index of the last matrix element to be filled.
	int _last_matrix;		int _last_matrix;
	//! Number of vector coordinates. QUESTION: correct?		//! Number of beam description wave numbers.
	int _nkv;		int _nkv;
	//! Contiguous vector of VKZM matrix.
	double *vec_vkzm;
	//! QUESTION: definition?		//! QUESTION: definition?
	double _apfafa;		double _apfafa;
	//! QUESTION: definition?		//! QUESTION: definition?
	@@ -152,13 +150,13 @@ protected:
	double _delxyz;		double _delxyz;
	//! QUESTION: definition?		//! QUESTION: definition?
	double _vknmx;		double _vknmx;
	//! QUESTION: definition?		//! Wave number grid spacing.
	double _delk;		double _delk;
	//! QUESTION: definition?		//! Square of wave number grid spacing.
	double _delks;		double _delks;
	//! NLMMT = LM * (LM + 2) * 2		//! NLMMT = LM * (LM + 2) * 2
	int _nlmmt;		int _nlmmt;
	//! Number of radial vector coordinates. QUESTION: correct?		//! Number of radial vector coordinates.
	int _nrvc;		int _nrvc;

	/*! \brief Load a Swap2 instance from a HDF5 binary file.		/*! \brief Load a Swap2 instance from a HDF5 binary file.
	@@ -192,12 +190,12 @@ public:
	const int &last_vector = _last_vector;		const int &last_vector = _last_vector;
	//! Read-only view on the index of the last matrix element to be filled.		//! Read-only view on the index of the last matrix element to be filled.
	const int &last_matrix = _last_matrix;		const int &last_matrix = _last_matrix;
	//! Read-only view on the number of vector coordinates. QUESTION: correct?		//! Read-only view on the number of beam description wave numbers.
	const int &nkv = _nkv;		const int &nkv = _nkv;
	//! QUESTION: definition?		//! QUESTION: definition?
	double *vkv;		double *vkv;
	//! QUESTION: definition?		//! QUESTION: definition?
	double **vkzm;		double *vec_vkzm;
	//! QUESTION: definition?		//! QUESTION: definition?
	const double &apfafa = _apfafa;		const double &apfafa = _apfafa;
	//! QUESTION: definition?		//! QUESTION: definition?
	@@ -222,12 +220,12 @@ public:
	const double &delks = _delks;		const double &delks = _delks;
	//! NLMMT = LM * (LM + 2) * 2		//! NLMMT = LM * (LM + 2) * 2
	const int &nlmmt = _nlmmt;		const int &nlmmt = _nlmmt;
	//! Number of radial vector coordinates. QUESTION: correct?		//! Read-only view on the number of radial vector coordinates.
	const int &nrvc = _nrvc;		const int &nrvc = _nrvc;

	/*! \brief Swap2 instance constructor.		/*! \brief Swap2 instance constructor.
	*		*
	* \param nkv: `int` Number of vector coordinates. QUESTION: correct?		* \param nkv: `int` Number of beam description wave numbers.
	*/		*/
	Swap2(int nkv);		Swap2(int nkv);

	@@ -244,15 +242,9 @@ public:
	*/		*/
	static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY");		static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY");

	/*! \brief Get the pointer to the VKZM matrix.
	*
	* \return value: `double **` Pointer to the VKZM matrix.
	*/
	double **get_matrix() { return vkzm; }

	/*! \brief Calculate the necessary amount of memory to create a new instance.		/*! \brief Calculate the necessary amount of memory to create a new instance.
	*		*
	* \param nkv: `int` Number of radial vector coordinates. QUESTION: correct?		* \param nkv: `int` Number of beam description wave numbers.
	* \return size: `long` The necessary memory size in bytes.		* \return size: `long` The necessary memory size in bytes.
	*/		*/
	static long get_size(int nkv);		static long get_size(int nkv);
	@@ -316,11 +308,11 @@ protected:
	int _nlmmt;		int _nlmmt;
	//! NRVC = NXV * NYV * NZV		//! NRVC = NXV * NYV * NZV
	int _nrvc;		int _nrvc;
	//! Field expansion mode identifier.		//! Beam description mode.
	int _lmode;		int _lmode;
	//! Maximum field expansion order.		//! Maximum field expansion order.
	int _lm;		int _lm;
	//! QUESTION: definition?		//! Number of beam description wave numbers.
	int _nkv;		int _nkv;
	//! Number of computed X coordinates.		//! Number of computed X coordinates.
	int _nxv;		int _nxv;
	@@ -332,11 +324,11 @@ protected:
	double _vk;		double _vk;
	//! External medium refractive index		//! External medium refractive index
	double _exri;		double _exri;
	//! QUESTION: definition?		//! Numerical aperture.
	double _an;		double _an;
	//! QUESTION: definition?		//! Filling factor.
	double _ff;		double _ff;
	//! QUESTION: definition?		//! Lens transmission.
	double _tra;		double _tra;
	//! QUESTION: definition?		//! QUESTION: definition?
	double _spd;		double _spd;
	@@ -350,8 +342,6 @@ protected:
	double *yv;		double *yv;
	//! Vector of computed z positions		//! Vector of computed z positions
	double *zv;		double *zv;
	//! QUESTION: definition?
	dcomplex *vec_wsum;

	/*! \brief Load a configuration instance from a HDF5 binary file.		/*! \brief Load a configuration instance from a HDF5 binary file.
	*		*
	@@ -402,11 +392,11 @@ public:
	const double& vk = _vk;		const double& vk = _vk;
	//! Read-only view on external medium refractive index		//! Read-only view on external medium refractive index
	const double& exri = _exri;		const double& exri = _exri;
	//! QUESTION: definition?		//! Read-only view on numeric aperture.
	const double& an = _an;		const double& an = _an;
	//! QUESTION: definition?		//! Read-only view on filling factor.
	const double& ff = _ff;		const double& ff = _ff;
	//! QUESTION: definition?		//! Read-only view on lens transmission.
	const double& tra = _tra;		const double& tra = _tra;
	//! QUESTION: definition?		//! QUESTION: definition?
	const double& spd = _spd;		const double& spd = _spd;
	@@ -415,7 +405,7 @@ public:
	//! QUESTION: definition?		//! QUESTION: definition?
	const double& exril = _exril;		const double& exril = _exril;
	//! QUESTION: definition?		//! QUESTION: definition?
	dcomplex **wsum;		dcomplex *vec_wsum;

	/*! \brief Trapping configuration instance constructor.		/*! \brief Trapping configuration instance constructor.
	*		*

src/libnptm/Configuration.cpp

+4 −2

src/libnptm/clu_subs.cpp

+115 −102

Original line number	Original line	Diff line number	Diff line
	@@ -47,10 +47,6 @@
	#include <omp.h>		#include <omp.h>
	#endif		#endif

	#ifdef USE_TARGET_OFFLOAD
	#pragma omp requires unified_shared_memory
	#endif

	using namespace std;		using namespace std;

	void apc(		void apc(
	@@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) {
	return result;		return result;
	}		}

	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp begin declare target device_type(any)		// #pragma omp begin declare target device_type(any)
	#endif		// #endif
	double cgev(int ipamo, int mu, int l, int m) {		double cgev(int ipamo, int mu, int l, int m) {
	double result = 0.0;		double result = 0.0;
	double xd = 0.0, xn = 0.0;		double xd = 0.0, xn = 0.0;
	@@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) {
	}		}
	return result;		return result;
	}		}
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp end declare target		// #pragma omp end declare target
	#endif		// #endif

	void cms(dcomplex **am, ParticleDescriptor *c1) {		void cms(dcomplex **am, ParticleDescriptor *c1) {
	dcomplex dm, de, cgh, cgk;		dcomplex dm, de, cgh, cgk;
	@@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) {
	delete[] svs;		delete[] svs;
	}		}

	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp begin declare target device_type(any)		// #pragma omp begin declare target device_type(any)
	#endif		// #endif
	dcomplex ghit_d(		dcomplex ghit_d(
	int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,		int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	ParticleDescriptor *c1, double *rac3j		ParticleDescriptor *c1, double *rac3j
	@@ -862,13 +858,13 @@ dcomplex ghit_d(
	}		}
	return result;		return result;
	}		}
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp end declare target		// #pragma omp end declare target
	#endif		// #endif

	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp begin declare target device_type(any)		// #pragma omp begin declare target device_type(any)
	#endif		// #endif
	dcomplex ghit(		dcomplex ghit(
	int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,		int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	ParticleDescriptor *c1		ParticleDescriptor *c1
	@@ -1079,9 +1075,9 @@ dcomplex ghit(
	}		}
	return result;		return result;
	}		}
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp end declare target		// #pragma omp end declare target
	#endif		// #endif

	void hjv(		void hjv(
	double exri, double vk, int &jer, int &lcalc, dcomplex &arg,		double exri, double vk, int &jer, int &lcalc, dcomplex &arg,
	@@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("pcros intermediate loop 1");		nvtxRangePush("pcros intermediate loop 1");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)		// #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
			// #endif
	#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)		#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
	#endif
	for (int i12 = 0; i12 < nlemt; i12++) {		for (int i12 = 0; i12 < nlemt; i12++) {
	// int i = i12 - 1;		// int i = i12 - 1;
	dcomplex am = cc0;		dcomplex am = cc0;
	@@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
	csam = -(ccs / (exri * vk)) * 0.5 * I;		csam = -(ccs / (exri * vk)) * 0.5 * I;
	sum2 = cc0;		sum2 = cc0;
	sum3 = cc0;		sum3 = cc0;
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)		// #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:sum2,sum3)
			// #endif
	#pragma omp parallel for simd reduction(+:sum2,sum3)		#pragma omp parallel for simd reduction(+:sum2,sum3)
	#endif
	for (int i14 = 0; i14 < c1->nlem; i14++) {		for (int i14 = 0; i14 < c1->nlem; i14++) {
	int ie = i14 + c1->nlem;		int ie = i14 + c1->nlem;
	sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);		sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
	@@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
	} // i14 loop		} // i14 loop
	double sumpi = 0.0;		double sumpi = 0.0;
	dcomplex sumpd = cc0;		dcomplex sumpd = cc0;
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)		// #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
	#else		// #else
			// #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
			// #endif
	#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)		#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
	#endif
	for (int i16 = 0; i16 < nlemt; i16++) {		for (int i16 = 0; i16 < nlemt; i16++) {
	for (int j16 = 0; j16 < c1->nlem; j16++) {		for (int j16 = 0; j16 < c1->nlem; j16++) {
	int je = j16 + c1->nlem;		int je = j16 + c1->nlem;
	@@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) {
	}		}
	}		}

	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp begin declare target device_type(any)		// #pragma omp begin declare target device_type(any)
	#endif		// #endif
	void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {		void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
	int jmx = j3 + j2;		int jmx = j3 + j2;
	int jdf = j3 - j2;		int jdf = j3 - j2;
	@@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
	}		}
	}		}
	}		}
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp end declare target		// #pragma omp end declare target
	#endif		// #endif

	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp begin declare target device_type(any)		// #pragma omp begin declare target device_type(any)
	#endif		// #endif
	void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {		void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
	int jmx = j3 + j2;		int jmx = j3 + j2;
	int jdf = j3 - j2;		int jdf = j3 - j2;
	@@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
	}		}
	}		}
	}		}
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp end declare target		// #pragma omp end declare target
	#endif		// #endif

	void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) {		void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) {
	int mmx = (j2 < j3 - m1) ? j2 : j3 - m1;		int mmx = (j2 < j3 - m1) ? j2 : j3 - m1;
	@@ -2005,11 +2004,12 @@ void raba(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("raba inner loop 1");		nvtxRangePush("raba inner loop 1");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)		// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:c1, c2)
			// #endif
	#pragma omp parallel for simd reduction(+:c1, c2)		#pragma omp parallel for simd reduction(+:c1, c2)
	#endif
	for (int j10 = 1; j10 <= nlemt; j10++) {		for (int j10 = 1; j10 <= nlemt; j10++) {
	int j = j10 - 1;		int j = j10 - 1;
	c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]);		c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]);
	@@ -2027,11 +2027,12 @@ void raba(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("raba outer loop 2");		nvtxRangePush("raba outer loop 2");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp teams distribute parallel for		// #pragma omp teams distribute parallel for
	#else		// #else
			// #pragma omp parallel for
			// #endif
	#pragma omp parallel for		#pragma omp parallel for
	#endif
	for (int ipo = 0; ipo < 2; ipo++) {		for (int ipo = 0; ipo < 2; ipo++) {
	int jpo = 1 - ipo;		int jpo = 1 - ipo;
	ctqce[ipo][0] = cc0;		ctqce[ipo][0] = cc0;
	@@ -2063,11 +2064,12 @@ void raba(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("raba inner loop 2");		nvtxRangePush("raba inner loop 2");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)		// #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
			// #endif
	#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)		#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
	#endif
	for (int k = 1; k<=kmax; k++) {		for (int k = 1; k<=kmax; k++) {
	int l60 = (int) sqrt(k+1);		int l60 = (int) sqrt(k+1);
	int im60 = k - (l60*l60) + 1;		int im60 = k - (l60*l60) + 1;
	@@ -2140,11 +2142,12 @@ void raba(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("raba loop 3");		nvtxRangePush("raba loop 3");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd		// #pragma omp target teams distribute parallel for simd
	#else		// #else
			// #pragma omp parallel for simd
			// #endif
	#pragma omp parallel for simd		#pragma omp parallel for simd
	#endif
	for (int ipo78 = 1; ipo78 <= 2; ipo78++) {		for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
	int ipo = ipo78 - 1;		int ipo = ipo78 - 1;
	tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;		tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;
	@@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("scr0 inner loop 1");		nvtxRangePush("scr0 inner loop 1");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)		// #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:sums, sum21)
			// #endif
	#pragma omp parallel for simd reduction(+:sums, sum21)		#pragma omp parallel for simd reduction(+:sums, sum21)
	#endif
	for (int l10 = 1; l10 <= c1->li; l10++) {		for (int l10 = 1; l10 <= c1->li; l10++) {
	double fl = 1.0 * (l10 + l10 + 1);		double fl = 1.0 * (l10 + l10 + 1);
	// dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1];		// dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1];
	@@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("scr0 loop 2");		nvtxRangePush("scr0 loop 2");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)		// #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
			// #endif
	#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)		#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
	#endif
	for (int i14 = 1; i14 <= c1->nsph; i14++) {		for (int i14 = 1; i14 <= c1->nsph; i14++) {
	int iogi = c1->iog[i14 - 1];		int iogi = c1->iog[i14 - 1];
	scs += c1->sscs[iogi - 1];		scs += c1->sscs[iogi - 1];
	@@ -2328,11 +2333,12 @@ void scr2(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("scr2 inner loop 1");		nvtxRangePush("scr2 inner loop 1");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)		// #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
	#else		// #else
			// #pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
			// #endif
	#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)		#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
	#endif
	for (int k = 1; k<=kmax; k++) {		for (int k = 1; k<=kmax; k++) {
	int l10 = (int) sqrt(k+1);		int l10 = (int) sqrt(k+1);
	int im10 = k - (l10*l10) + 1;		int im10 = k - (l10*l10) + 1;
	@@ -2384,11 +2390,12 @@ void scr2(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("scr2 loop 2");		nvtxRangePush("scr2 loop 2");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)		// #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
	#else		// #else
			// #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
			// #endif
	#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)		#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
	#endif
	for (int i14 = 1; i14 <= c1->nsph; i14++) {		for (int i14 = 1; i14 <= c1->nsph; i14++) {
	int i = i14 - 1;		int i = i14 - 1;
	int iogi = c1->iog[i14 - 1];		int iogi = c1->iog[i14 - 1];
	@@ -2418,11 +2425,12 @@ void scr2(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("scr2 inner loop 3");		nvtxRangePush("scr2 inner loop 3");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd collapse(4)		// #pragma omp target teams distribute parallel for simd collapse(4)
	#else		// #else
			// #pragma omp parallel for simd collapse(4)
			// #endif
	#pragma omp parallel for simd collapse(4)		#pragma omp parallel for simd collapse(4)
	#endif
	for (int ipo1 = 1; ipo1 <=2; ipo1++) {		for (int ipo1 = 1; ipo1 <=2; ipo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {		for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	for (int ipo2 = 1; ipo2 <= 2; ipo2++) {		for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	@@ -2445,11 +2453,12 @@ void scr2(
	#ifdef USE_NVTX		#ifdef USE_NVTX
	nvtxRangePush("scr2 loop 4");		nvtxRangePush("scr2 loop 4");
	#endif		#endif
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for collapse(4)		// #pragma omp target teams distribute parallel for collapse(4)
	#else		// #else
			// #pragma omp parallel for collapse(4)
			// #endif
	#pragma omp parallel for collapse(4)		#pragma omp parallel for collapse(4)
	#endif
	for (int ipo1 = 1; ipo1 <=2; ipo1++) {		for (int ipo1 = 1; ipo1 <=2; ipo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {		for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	for (int ipo2 = 1; ipo2 <= 2; ipo2++) {		for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	@@ -2582,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	// but if it results im = 0, then we set l = l-1 and im = 2*l+1		// but if it results im = 0, then we set l = l-1 and im = 2*l+1
	// furthermore if it results im > 2*l+1, then we set		// furthermore if it results im > 2*l+1, then we set
	// im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)		// im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd collapse(3)		// #pragma omp target teams distribute parallel for simd collapse(3)
	#else		// #else
			// #pragma omp parallel for simd collapse(3)
			// #endif
	#pragma omp parallel for simd collapse(3)		#pragma omp parallel for simd collapse(3)
	#endif
	for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?		for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
	for (int k2 = 1; k2<=k2max; k2++) {		for (int k2 = 1; k2<=k2max; k2++) {
	for (int k3 = 1; k3<=k3max; k3++) {		for (int k3 = 1; k3<=k3max; k3++) {
	@@ -2632,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	#endif		#endif
	dcomplex *am_v = am[0];		dcomplex *am_v = am[0];
	dcomplex *sam_v = c1->sam[0];		dcomplex *sam_v = c1->sam[0];
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd collapse(2)		// #pragma omp target teams distribute parallel for simd collapse(2)
	#else		// #else
			// #pragma omp parallel for simd collapse(2)
			// #endif
	#pragma omp parallel for simd collapse(2)		#pragma omp parallel for simd collapse(2)
	#endif
	for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?		for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
	for (int i3 = 1; i3 <= c1->nlem; i3++) {		for (int i3 = 1; i3 <= c1->nlem; i3++) {
	dcomplex sum1 = cc0;		dcomplex sum1 = cc0;
	@@ -2669,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	sam_v[vecind1e + i3e - 1] = sum4;		sam_v[vecind1e + i3e - 1] = sum4;
	} // i3 loop		} // i3 loop
	} // i1 loop		} // i1 loop
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd collapse(2)		// #pragma omp target teams distribute parallel for simd collapse(2)
	#else		// #else
			// #pragma omp parallel for simd collapse(2)
			// #endif
	#pragma omp parallel for simd collapse(2)		#pragma omp parallel for simd collapse(2)
	#endif
	for (int i1 = 1; i1 <= c1->ndi; i1++) {		for (int i1 = 1; i1 <= c1->ndi; i1++) {
	for (int i0 = 1; i0 <= c1->nlem; i0++) {		for (int i0 = 1; i0 <= c1->nlem; i0++) {
	int vecindex = (i1 - 1) * c1->nlem + i0 - 1;		int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
	@@ -2682,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	} // i0 loop		} // i0 loop
	} // i1 loop		} // i1 loop
	dcomplex *vec_am0m = c1->am0m[0];		dcomplex *vec_am0m = c1->am0m[0];
	#ifdef USE_TARGET_OFFLOAD		// #ifdef USE_TARGET_OFFLOAD
	#pragma omp target teams distribute parallel for simd collapse(2)		// #pragma omp target teams distribute parallel for simd collapse(2)
	#else		// #else
			// #pragma omp parallel for simd collapse(2)
			// #endif
	#pragma omp parallel for simd collapse(2)		#pragma omp parallel for simd collapse(2)
	#endif
	for (int i0 = 1; i0 <= c1->nlem; i0++) {		for (int i0 = 1; i0 <= c1->nlem; i0++) {
	for (int i3 = 1; i3 <= c1->nlemt; i3++) {		for (int i3 = 1; i3 <= c1->nlemt; i3++) {
	int i0e = i0 + c1->nlem;		int i0e = i0 + c1->nlem;

src/libnptm/tfrfme.cpp

+54 −23

Original line number	Original line	Diff line number	Diff line
	@@ -44,6 +44,10 @@
	#include "../include/file_io.h"		#include "../include/file_io.h"
	#endif		#endif

			#ifdef USE_TARGET_OFFLOAD
			#include <cstdlib>
			#endif

	using namespace std;		using namespace std;

	// >>> START OF Swap1 CLASS IMPLEMENTATION <<<		// >>> START OF Swap1 CLASS IMPLEMENTATION <<<
	@@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) {
	// >>> START OF Swap2 CLASS IMPLEMENTATION <<<		// >>> START OF Swap2 CLASS IMPLEMENTATION <<<
	Swap2::Swap2(int nkv) {		Swap2::Swap2(int nkv) {
	_nkv = nkv;		_nkv = nkv;
			#ifdef USE_TARGET_OFFLOAD
			vkv = (double *)aligned_alloc(64, _nkv * sizeof(double));
			vec_vkzm = (double *)aligned_alloc(64, _nkv * _nkv * sizeof(double));
			#pragma omp parallel for collapse(2)
			for (int i = 0; i < _nkv; i++) {
			for (int j = 0; j < _nkv; j++) {
			vkv[i] = 0.0;
			vec_vkzm[_nkv * i +j] = 0.0;
			}
			}
			#else
	vkv = new double[_nkv]();		vkv = new double[_nkv]();
	vec_vkzm = new double[_nkv * _nkv]();		vec_vkzm = new double[_nkv * _nkv]();
	vkzm = new double*[nkv];		#endif // USE TARGET_OFFLOAD
	for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv;
	_last_vector = 0;		_last_vector = 0;
	_last_matrix = 0;		_last_matrix = 0;
	}		}

	Swap2::~Swap2() {		Swap2::~Swap2() {
			#ifdef USE_TARGET_OFFLOAD
			free(vkv);
			free(vec_vkzm);
			#else
	delete[] vkv;		delete[] vkv;
	delete[] vec_vkzm;		delete[] vec_vkzm;
	delete[] vkzm;		#endif // USE_TARGET_OFFLOAD
	}		}

	Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) {		Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) {
	@@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) {
	fstream input;		fstream input;
	Swap2 *instance = NULL;		Swap2 *instance = NULL;
	int fnkv, fnlmmt, fnrvc;		int fnkv, fnlmmt, fnrvc;
	double **fvkzm = NULL;		double *fvkzm = NULL;
	double *fvkv = NULL;		double *fvkv = NULL;
	double value;		double value;
	input.open(file_name.c_str(), ios::in | ios::binary);		input.open(file_name.c_str(), ios::in | ios::binary);
	if (input.is_open()) {		if (input.is_open()) {
	input.read(reinterpret_cast<char *>(&fnkv), sizeof(int));		input.read(reinterpret_cast<char *>(&fnkv), sizeof(int));
	instance = new Swap2(fnkv);		instance = new Swap2(fnkv);
	fvkzm = instance->get_matrix();		fvkzm = instance->vec_vkzm;
	fvkv = instance->get_vector();		fvkv = instance->get_vector();
	for (int vj = 0; vj < fnkv; vj++) {		for (int vj = 0; vj < fnkv; vj++) {
	input.read(reinterpret_cast<char *>(&value), sizeof(double));		input.read(reinterpret_cast<char *>(&value), sizeof(double));
	@@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) {
	for (int mi = 0; mi < fnkv; mi++) {		for (int mi = 0; mi < fnkv; mi++) {
	for (int mj = 0; mj < fnkv; mj++) {		for (int mj = 0; mj < fnkv; mj++) {
	input.read(reinterpret_cast<char *>(&value), sizeof(double));		input.read(reinterpret_cast<char *>(&value), sizeof(double));
	fvkzm[mi][mj] = value;		fvkzm[fnkv * mi + mj] = value;
	}		}
	}		}
	input.read(reinterpret_cast<char *>(&value), sizeof(double));		input.read(reinterpret_cast<char *>(&value), sizeof(double));
	@@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) {
	void Swap2::push_matrix(double value) {		void Swap2::push_matrix(double value) {
	int col = _last_matrix % (_nkv - 1);		int col = _last_matrix % (_nkv - 1);
	int row = _last_matrix - (_nkv * row);		int row = _last_matrix - (_nkv * row);
	vkzm[row][col] = value;		vec_vkzm[nkv * row + col] = value;
	_last_matrix++;		_last_matrix++;
	}		}

	@@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) {
	}		}
	for (int mi = 0; mi < _nkv; mi++) {		for (int mi = 0; mi < _nkv; mi++) {
	for (int mj = 0; mj < _nkv; mj++) {		for (int mj = 0; mj < _nkv; mj++) {
	value = vkzm[mi][mj];		value = vec_vkzm[nkv * mi + mj];
	output.write(reinterpret_cast<const char*>(&value), sizeof(double));		output.write(reinterpret_cast<const char*>(&value), sizeof(double));
	}		}
	}		}
	@@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) {
	}		}
	}		}
	for (int mi = 0; mi < _nkv; mi++) {		for (int mi = 0; mi < _nkv; mi++) {
			int nkvi = nkv * mi;
	for (int mj = 0; mj < _nkv; mj++) {		for (int mj = 0; mj < _nkv; mj++) {
	if (vkzm[mi][mj] != other.vkzm[mi][mj]) {		if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) {
	return false;		return false;
	}		}
	}		}
	@@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) {
	_exril = 0.0;		_exril = 0.0;

	// Array initialization		// Array initialization
	xv = new double[nxv]();
	yv = new double[nyv]();
	zv = new double[nzv]();
	_nlmmt = _lm * (_lm + 2) * 2;		_nlmmt = _lm * (_lm + 2) * 2;
	_nrvc = _nxv * _nyv * _nzv;		_nrvc = _nxv * _nyv * _nzv;
	vec_wsum = new dcomplex[nrvc * nlmmt]();		#ifdef USE_TARGET_OFFLOAD
	wsum = new dcomplex*[nlmmt];		xv = (double *)aligned_alloc(64, sizeof(double) * _nxv);
	for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc;		yv = (double *)aligned_alloc(64, sizeof(double) * _nyv);
			zv = (double *)aligned_alloc(64, sizeof(double) * _nzv);
			vec_wsum = (dcomplex *)aligned_alloc(64, sizeof(dcomplex) * _nrvc * _nlmmt);
			#else
			xv = new double[_nxv]();
			yv = new double[_nyv]();
			zv = new double[_nzv]();
			vec_wsum = new dcomplex[_nrvc * _nlmmt]();
			#endif // USE_TARGET_OFFLOAD
	}		}

	TFRFME::~TFRFME() {		TFRFME::~TFRFME() {
			#ifdef USE_TARGET_OFFLOAD
			free(xv);
			free(yv);
			free(zv);
			free(vec_wsum);
			#else
	delete[] xv;		delete[] xv;
	delete[] yv;		delete[] yv;
	delete[] zv;		delete[] zv;
	delete[] vec_wsum;		delete[] vec_wsum;
	delete[] wsum;		#endif
	}		}

	TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) {		TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) {
	@@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) {
	for (int wj = 0; wj < nrvc; wj++) {		for (int wj = 0; wj < nrvc; wj++) {
	for (int wi = 0; wi < nlmmt; wi++) {		for (int wi = 0; wi < nlmmt; wi++) {
	value = elements[index] + elements[index + 1] * I;		value = elements[index] + elements[index + 1] * I;
	instance->wsum[wi][wj] = value;		instance->vec_wsum[nrvc * wi + wj] = value;
	index += 2;		index += 2;
	} // wi loop		} // wi loop
	} // wj loop		} // wj loop
	@@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) {
	input.read(reinterpret_cast<char *>(&rval), sizeof(double));		input.read(reinterpret_cast<char *>(&rval), sizeof(double));
	input.read(reinterpret_cast<char *>(&ival), sizeof(double));		input.read(reinterpret_cast<char *>(&ival), sizeof(double));
	dcomplex value = rval + ival * I;		dcomplex value = rval + ival * I;
	instance->wsum[wi][wj] = value;		instance->vec_wsum[nrvc * wi + wj] = value;
	} // wi loop		} // wi loop
	} // wj loop		} // wj loop
	input.close();		input.close();
	@@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) {
	int index = 0;		int index = 0;
	for (int wj = 0; wj < nrvc; wj++) {		for (int wj = 0; wj < nrvc; wj++) {
	for (int wi = 0; wi < nlmmt; wi++) {		for (int wi = 0; wi < nlmmt; wi++) {
	ptr_elements[index++] = real(wsum[wi][wj]);		ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]);
	ptr_elements[index++] = imag(wsum[wi][wj]);		ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]);
	} // wi loop		} // wi loop
	} // wj loop		} // wj loop
	rec_ptr_list.append(ptr_elements);		rec_ptr_list.append(ptr_elements);
	@@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) {
	output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double));		output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double));
	for (int wj = 0; wj < _nrvc; wj++) {		for (int wj = 0; wj < _nrvc; wj++) {
	for (int wi = 0; wi < _nlmmt; wi++) {		for (int wi = 0; wi < _nlmmt; wi++) {
	double rval = real(wsum[wi][wj]);		double rval = real(vec_wsum[nrvc * wi + wj]);
	double ival = imag(wsum[wi][wj]);		double ival = imag(vec_wsum[nrvc * wi + wj]);
	output.write(reinterpret_cast<char *>(&rval), sizeof(double));		output.write(reinterpret_cast<char *>(&rval), sizeof(double));
	output.write(reinterpret_cast<char *>(&ival), sizeof(double));		output.write(reinterpret_cast<char *>(&ival), sizeof(double));
	} // wi loop		} // wi loop
	@@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) {
	}		}
	}		}
	for (int wi = 0; wi < _nlmmt; wi++) {		for (int wi = 0; wi < _nlmmt; wi++) {
			int i = _nrvc * wi;
	for (int wj = 0; wj < _nrvc; wj++) {		for (int wj = 0; wj < _nrvc; wj++) {
	if (wsum[wi][wj] != other.wsum[wi][wj]) {		if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) {
	return false;		return false;
	}		}
	} // wj loop		} // wj loop

Original line number	Original line	Diff line number	Diff line
	@@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil
	// Read optional configuration data used only by the C++ code.		// Read optional configuration data used only by the C++ code.
	while (num_lines > last_read_line) {		while (num_lines > last_read_line) {
	str_target = file_lines[last_read_line++];		str_target = file_lines[last_read_line++];
	if (str_target.size() > 0) {		if (str_target.size() > 15) {
	if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) {		if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) {
	regex_search(str_target, m, re);		regex_search(str_target, m, re);
	short refine_flag = (short)stoi(m.str());		short refine_flag = (short)stoi(m.str());
	conf->_refine_flag = refine_flag;		conf->_refine_flag = refine_flag;
	}		}
	else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) {		}
			if (str_target.size() > 14) {
			if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) {
	regex_search(str_target, m, re);		regex_search(str_target, m, re);
	short dyn_order_flag = (short)stoi(m.str());		short dyn_order_flag = (short)stoi(m.str());
	conf->_dyn_order_flag = dyn_order_flag;		conf->_dyn_order_flag = dyn_order_flag;