Comment out offload dependent code in clu_subs (44000bfd) · Commits · Giacomo Mulas / NP_TMcode

src/libnptm/clu_subs.cpp

+115 −98

Original line number	Diff line number	Diff line
		@@ -403,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) {
		return result;
		}

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		double cgev(int ipamo, int mu, int l, int m) {
		double result = 0.0;
		double xd = 0.0, xn = 0.0;
		@@ -439,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) {
		}
		return result;
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		void cms(dcomplex *am, ParticleDescriptor c1) {
		dcomplex dm, de, cgh, cgk;
		@@ -645,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) {
		delete[] svs;
		}

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		dcomplex ghit_d(
		int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
		ParticleDescriptor c1, double rac3j
		@@ -858,13 +858,13 @@ dcomplex ghit_d(
		}
		return result;
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		dcomplex ghit(
		int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
		ParticleDescriptor *c1
		@@ -1075,9 +1075,9 @@ dcomplex ghit(
		}
		return result;
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		void hjv(
		double exri, double vk, int &jer, int &lcalc, dcomplex &arg,
		@@ -1335,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
		#ifdef USE_NVTX
		nvtxRangePush("pcros intermediate loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		// #else
		// #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		// #endif
		#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
		#endif
		for (int i12 = 0; i12 < nlemt; i12++) {
		// int i = i12 - 1;
		dcomplex am = cc0;
		@@ -1404,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
		csam = -(ccs / (exri * vk)) * 0.5 * I;
		sum2 = cc0;
		sum3 = cc0;
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
		// #else
		// #pragma omp parallel for simd reduction(+:sum2,sum3)
		// #endif
		#pragma omp parallel for simd reduction(+:sum2,sum3)
		#endif
		for (int i14 = 0; i14 < c1->nlem; i14++) {
		int ie = i14 + c1->nlem;
		sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
		@@ -1416,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
		} // i14 loop
		double sumpi = 0.0;
		dcomplex sumpd = cc0;
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		// #else
		// #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		// #endif
		#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
		#endif
		for (int i16 = 0; i16 < nlemt; i16++) {
		for (int j16 = 0; j16 < c1->nlem; j16++) {
		int je = j16 + c1->nlem;
		@@ -1624,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) {
		}
		}

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
		int jmx = j3 + j2;
		int jdf = j3 - j2;
		@@ -1744,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
		}
		}
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		#ifdef USE_TARGET_OFFLOAD
		#pragma omp begin declare target device_type(any)
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp begin declare target device_type(any)
		// #endif
		void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
		int jmx = j3 + j2;
		int jdf = j3 - j2;
		@@ -1868,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
		}
		}
		}
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp end declare target
		#endif
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp end declare target
		// #endif

		void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) {
		int mmx = (j2 < j3 - m1) ? j2 : j3 - m1;
		@@ -2001,11 +2004,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba inner loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
		// #else
		// #pragma omp parallel for simd reduction(+:c1, c2)
		// #endif
		#pragma omp parallel for simd reduction(+:c1, c2)
		#endif
		for (int j10 = 1; j10 <= nlemt; j10++) {
		int j = j10 - 1;
		c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]);
		@@ -2023,11 +2027,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba outer loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp teams distribute parallel for
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp teams distribute parallel for
		// #else
		// #pragma omp parallel for
		// #endif
		#pragma omp parallel for
		#endif
		for (int ipo = 0; ipo < 2; ipo++) {
		int jpo = 1 - ipo;
		ctqce[ipo][0] = cc0;
		@@ -2059,11 +2064,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba inner loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		// #else
		// #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		// #endif
		#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
		#endif
		for (int k = 1; k<=kmax; k++) {
		int l60 = (int) sqrt(k+1);
		int im60 = k - (l60*l60) + 1;
		@@ -2136,11 +2142,12 @@ void raba(
		#ifdef USE_NVTX
		nvtxRangePush("raba loop 3");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd
		// #else
		// #pragma omp parallel for simd
		// #endif
		#pragma omp parallel for simd
		#endif
		for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
		int ipo = ipo78 - 1;
		tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;
		@@ -2210,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
		#ifdef USE_NVTX
		nvtxRangePush("scr0 inner loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
		// #else
		// #pragma omp parallel for simd reduction(+:sums, sum21)
		// #endif
		#pragma omp parallel for simd reduction(+:sums, sum21)
		#endif
		for (int l10 = 1; l10 <= c1->li; l10++) {
		double fl = 1.0 * (l10 + l10 + 1);
		// dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1];
		@@ -2258,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
		#ifdef USE_NVTX
		nvtxRangePush("scr0 loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
		// #else
		// #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
		// #endif
		#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
		#endif
		for (int i14 = 1; i14 <= c1->nsph; i14++) {
		int iogi = c1->iog[i14 - 1];
		scs += c1->sscs[iogi - 1];
		@@ -2324,11 +2333,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 inner loop 1");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
		// #else
		// #pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
		// #endif
		#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
		#endif
		for (int k = 1; k<=kmax; k++) {
		int l10 = (int) sqrt(k+1);
		int im10 = k - (l10*l10) + 1;
		@@ -2380,11 +2390,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 loop 2");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		// #else
		// #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		// #endif
		#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
		#endif
		for (int i14 = 1; i14 <= c1->nsph; i14++) {
		int i = i14 - 1;
		int iogi = c1->iog[i14 - 1];
		@@ -2414,11 +2425,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 inner loop 3");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(4)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(4)
		// #else
		// #pragma omp parallel for simd collapse(4)
		// #endif
		#pragma omp parallel for simd collapse(4)
		#endif
		for (int ipo1 = 1; ipo1 <=2; ipo1++) {
		for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
		for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
		@@ -2441,11 +2453,12 @@ void scr2(
		#ifdef USE_NVTX
		nvtxRangePush("scr2 loop 4");
		#endif
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for collapse(4)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for collapse(4)
		// #else
		// #pragma omp parallel for collapse(4)
		// #endif
		#pragma omp parallel for collapse(4)
		#endif
		for (int ipo1 = 1; ipo1 <=2; ipo1++) {
		for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
		for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
		@@ -2578,11 +2591,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		// but if it results im = 0, then we set l = l-1 and im = 2*l+1
		// furthermore if it results im > 2*l+1, then we set
		// im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(3)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(3)
		// #else
		// #pragma omp parallel for simd collapse(3)
		// #endif
		#pragma omp parallel for simd collapse(3)
		#endif
		for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
		for (int k2 = 1; k2<=k2max; k2++) {
		for (int k3 = 1; k3<=k3max; k3++) {
		@@ -2628,11 +2642,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		#endif
		dcomplex *am_v = am[0];
		dcomplex *sam_v = c1->sam[0];
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2)
		// #else
		// #pragma omp parallel for simd collapse(2)
		// #endif
		#pragma omp parallel for simd collapse(2)
		#endif
		for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
		for (int i3 = 1; i3 <= c1->nlem; i3++) {
		dcomplex sum1 = cc0;
		@@ -2665,11 +2680,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		sam_v[vecind1e + i3e - 1] = sum4;
		} // i3 loop
		} // i1 loop
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2)
		// #else
		// #pragma omp parallel for simd collapse(2)
		// #endif
		#pragma omp parallel for simd collapse(2)
		#endif
		for (int i1 = 1; i1 <= c1->ndi; i1++) {
		for (int i0 = 1; i0 <= c1->nlem; i0++) {
		int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
		@@ -2678,11 +2694,12 @@ void ztm(dcomplex *am, ParticleDescriptor c1) {
		} // i0 loop
		} // i1 loop
		dcomplex *vec_am0m = c1->am0m[0];
		#ifdef USE_TARGET_OFFLOAD
		#pragma omp target teams distribute parallel for simd collapse(2)
		#else
		// #ifdef USE_TARGET_OFFLOAD
		// #pragma omp target teams distribute parallel for simd collapse(2)
		// #else
		// #pragma omp parallel for simd collapse(2)
		// #endif
		#pragma omp parallel for simd collapse(2)
		#endif
		for (int i0 = 1; i0 <= c1->nlem; i0++) {
		for (int i3 = 1; i3 <= c1->nlemt; i3++) {
		int i0e = i0 + c1->nlem;