Loading src/libnptm/clu_subs.cpp +115 −98 Original line number Diff line number Diff line Loading @@ -403,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double xd = 0.0, xn = 0.0; Loading Loading @@ -439,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) { } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void cms(dcomplex **am, ParticleDescriptor *c1) { dcomplex dm, de, cgh, cgk; Loading Loading @@ -645,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j Loading Loading @@ -858,13 +858,13 @@ dcomplex ghit_d( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 Loading Loading @@ -1075,9 +1075,9 @@ dcomplex ghit( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -1335,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #else // #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #endif #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; dcomplex am = cc0; Loading Loading @@ -1404,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) // #else // #pragma omp parallel for simd reduction(+:sum2,sum3) // #endif #pragma omp parallel for simd reduction(+:sum2,sum3) #endif for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); Loading @@ -1416,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { } // i14 loop double sumpi = 0.0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #else // #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #endif #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { int je = j16 + c1->nlem; Loading Loading @@ -1624,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) { } } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1744,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1868,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; Loading Loading @@ -2001,11 +2004,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) // #else // #pragma omp parallel for simd reduction(+:c1, c2) // #endif #pragma omp parallel for simd reduction(+:c1, c2) #endif for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); Loading @@ -2023,11 +2027,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp teams distribute parallel for // #else // #pragma omp parallel for // #endif #pragma omp parallel for #endif for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; ctqce[ipo][0] = cc0; Loading Loading @@ -2059,11 +2064,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #else // #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #endif #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; Loading Loading @@ -2136,11 +2142,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #else // #pragma omp parallel for simd // #endif #pragma omp parallel for simd #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; Loading Loading @@ -2210,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) // #else // #pragma omp parallel for simd reduction(+:sums, sum21) // #endif #pragma omp parallel for simd reduction(+:sums, sum21) #endif for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; Loading Loading @@ -2258,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) // #else // #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) // #endif #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; Loading Loading @@ -2324,11 +2333,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; Loading Loading @@ -2380,11 +2390,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2414,11 +2425,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(4) // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading @@ -2441,11 +2453,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for collapse(4) // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading Loading @@ -2578,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(3) // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2628,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; Loading Loading @@ -2665,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2678,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i0 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; Loading Loading
src/libnptm/clu_subs.cpp +115 −98 Original line number Diff line number Diff line Loading @@ -403,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double xd = 0.0, xn = 0.0; Loading Loading @@ -439,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) { } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void cms(dcomplex **am, ParticleDescriptor *c1) { dcomplex dm, de, cgh, cgk; Loading Loading @@ -645,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j Loading Loading @@ -858,13 +858,13 @@ dcomplex ghit_d( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 Loading Loading @@ -1075,9 +1075,9 @@ dcomplex ghit( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -1335,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #else // #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #endif #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; dcomplex am = cc0; Loading Loading @@ -1404,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) // #else // #pragma omp parallel for simd reduction(+:sum2,sum3) // #endif #pragma omp parallel for simd reduction(+:sum2,sum3) #endif for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); Loading @@ -1416,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { } // i14 loop double sumpi = 0.0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #else // #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #endif #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { int je = j16 + c1->nlem; Loading Loading @@ -1624,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) { } } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1744,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1868,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; Loading Loading @@ -2001,11 +2004,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) // #else // #pragma omp parallel for simd reduction(+:c1, c2) // #endif #pragma omp parallel for simd reduction(+:c1, c2) #endif for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); Loading @@ -2023,11 +2027,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp teams distribute parallel for // #else // #pragma omp parallel for // #endif #pragma omp parallel for #endif for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; ctqce[ipo][0] = cc0; Loading Loading @@ -2059,11 +2064,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #else // #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #endif #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; Loading Loading @@ -2136,11 +2142,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #else // #pragma omp parallel for simd // #endif #pragma omp parallel for simd #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; Loading Loading @@ -2210,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) // #else // #pragma omp parallel for simd reduction(+:sums, sum21) // #endif #pragma omp parallel for simd reduction(+:sums, sum21) #endif for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; Loading Loading @@ -2258,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) // #else // #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) // #endif #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; Loading Loading @@ -2324,11 +2333,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; Loading Loading @@ -2380,11 +2390,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2414,11 +2425,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(4) // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading @@ -2441,11 +2453,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for collapse(4) // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading Loading @@ -2578,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(3) // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2628,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; Loading Loading @@ -2665,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2678,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i0 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; Loading