Loading src/libnptm/clu_subs.cpp +19 −10 Original line number Original line Diff line number Diff line Loading @@ -2534,7 +2534,7 @@ void tqr( } } void ztm(dcomplex **am, ParticleDescriptor *c1) { void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I; // int i2 = 0; // old implementation // int i2 = 0; // old implementation #ifdef USE_NVTX #ifdef USE_NVTX Loading @@ -2546,7 +2546,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // C9 *c9_para = new C9(*c9); // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c1->gis[0]; dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; dcomplex *gls_v = c1->gls[0]; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); int k2max = c1->li*(c1->li+2); int k2max = c1->li*(c1->li+2); int k3max = c1->le*(c1->le+2); int k3max = c1->le*(c1->le+2); // To parallelise, I run a linearised loop directly over k // To parallelise, I run a linearised loop directly over k Loading @@ -2559,6 +2558,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #pragma omp target teams distribute parallel for simd collapse(3) #else #pragma omp parallel for simd collapse(3) #endif #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k2 = 1; k2<=k2max; k2++) { Loading Loading @@ -2590,12 +2591,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i3 = l3 * l3 + im3 - 1; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); free(rac3j_local); } // close k3 loop, former l3 + im3 loops } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop } // close n2 loop free(rac3j_local); #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif #endif Loading @@ -2606,6 +2608,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex *sam_v = c1->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { for (int i3 = 1; i3 <= c1->nlem; i3++) { Loading @@ -2615,7 +2619,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex sum4 = cc0; dcomplex sum4 = cc0; int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int i3e = i3 + c1->nlem; int i3e = i3 + c1->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= c1->ndi; i2++) { for (int i2 = 1; i2 <= c1->ndi; i2++) { int i2e = i2 + c1->ndi; int i2e = i2 + c1->ndi; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; Loading @@ -2640,7 +2643,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i3 loop } // i1 loop } // i1 loop #pragma omp parallel for collapse(2) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2650,7 +2657,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i1 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { Loading @@ -2661,11 +2670,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; a1 = sam_v[vecind1 + i3 - 1]; dcomplex a1 = sam_v[vecind1 + i3 - 1]; a2 = sam_v[vecind1e + i3 - 1]; dcomplex a2 = sam_v[vecind1e + i3 - 1]; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gie = gis_v[vecindex]; dcomplex gie = gis_v[vecindex]; gle = gls_v[vecindex]; dcomplex gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); sum2 += (a1 * gle + a2 * gie); } // i1 loop } // i1 loop Loading Loading
src/libnptm/clu_subs.cpp +19 −10 Original line number Original line Diff line number Diff line Loading @@ -2534,7 +2534,7 @@ void tqr( } } void ztm(dcomplex **am, ParticleDescriptor *c1) { void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I; // int i2 = 0; // old implementation // int i2 = 0; // old implementation #ifdef USE_NVTX #ifdef USE_NVTX Loading @@ -2546,7 +2546,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // C9 *c9_para = new C9(*c9); // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c1->gis[0]; dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; dcomplex *gls_v = c1->gls[0]; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); int k2max = c1->li*(c1->li+2); int k2max = c1->li*(c1->li+2); int k3max = c1->le*(c1->le+2); int k3max = c1->le*(c1->le+2); // To parallelise, I run a linearised loop directly over k // To parallelise, I run a linearised loop directly over k Loading @@ -2559,6 +2558,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #pragma omp target teams distribute parallel for simd collapse(3) #else #pragma omp parallel for simd collapse(3) #endif #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k2 = 1; k2<=k2max; k2++) { Loading Loading @@ -2590,12 +2591,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i3 = l3 * l3 + im3 - 1; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); free(rac3j_local); } // close k3 loop, former l3 + im3 loops } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop } // close n2 loop free(rac3j_local); #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif #endif Loading @@ -2606,6 +2608,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex *sam_v = c1->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { for (int i3 = 1; i3 <= c1->nlem; i3++) { Loading @@ -2615,7 +2619,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex sum4 = cc0; dcomplex sum4 = cc0; int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int i3e = i3 + c1->nlem; int i3e = i3 + c1->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= c1->ndi; i2++) { for (int i2 = 1; i2 <= c1->ndi; i2++) { int i2e = i2 + c1->ndi; int i2e = i2 + c1->ndi; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; Loading @@ -2640,7 +2643,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i3 loop } // i1 loop } // i1 loop #pragma omp parallel for collapse(2) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2650,7 +2657,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i1 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { Loading @@ -2661,11 +2670,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; a1 = sam_v[vecind1 + i3 - 1]; dcomplex a1 = sam_v[vecind1 + i3 - 1]; a2 = sam_v[vecind1e + i3 - 1]; dcomplex a2 = sam_v[vecind1e + i3 - 1]; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gie = gis_v[vecindex]; dcomplex gie = gis_v[vecindex]; gle = gls_v[vecindex]; dcomplex gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); sum2 += (a1 * gle + a2 * gie); } // i1 loop } // i1 loop Loading