Loading src/libnptm/clu_subs.cpp +5 −34 Original line number Diff line number Diff line Loading @@ -2243,17 +2243,13 @@ void tqr( } void ztm(dcomplex **am, ParticleDescriptor *c1) { // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; // int i2 = 0; // old implementation double *rac3j = new double[c1->lmtpo]; #ifdef USE_NVTX nvtxRangePush("ZTM starts"); #endif #ifdef USE_NVTX nvtxRangePush("ZTM parallel loop 1"); #endif // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; int k2max = c1->li*(c1->li+2); Loading @@ -2266,15 +2262,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(3) // #else // #pragma omp parallel for simd collapse(3) // #endif // #pragma omp parallel for simd collapse(3) #pragma omp parallel for simd collapse(3) for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { double rac3j[c1->lmtpo]; int l2 = (int) sqrt(k2+1); int im2 = k2 - (l2*l2) + 1; if (im2 == 0) { Loading @@ -2295,23 +2287,17 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { im3 -= 2*l3 + 1; l3++; } // int l2tpo = l2 + l2 + 1; // int l3tpo = l3 + l3 + 1; int i2 = (n2-1) * c1->li * (c1->li + 2) + l2 * l2 + im2 - 1; int m2 = -l2 - 1 + im2; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; // double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); // gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); // gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); // free(rac3j_local); gis_v[vecindex] = ghit(2, 0, n2, l2, m2, l3, m3, c1, rac3j); gls_v[vecindex] = ghit(2, 1, n2, l2, m2, l3, m3, c1, rac3j); } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop delete[] rac3j; // delete[] rac3j; #ifdef USE_NVTX nvtxRangePop(); #endif Loading @@ -2320,11 +2306,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { Loading Loading @@ -2358,11 +2339,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { Loading @@ -2371,12 +2348,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { gls_v[vecindex] = dconjg(gls_v[vecindex]); } // i0 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { Loading @@ -2399,8 +2372,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int vecind0e = (i0e - 1) * c1->nlemt; vec_am0m[vecind0 + i3 - 1] = -sum1; vec_am0m[vecind0e + i3 - 1] = -sum2; // c1->am0m[i0 - 1][i3 - 1] = -sum1; // c1->am0m[i0e - 1][i3 - 1] = -sum2; } // i3 loop } // i0 loop #ifdef USE_NVTX Loading Loading
src/libnptm/clu_subs.cpp +5 −34 Original line number Diff line number Diff line Loading @@ -2243,17 +2243,13 @@ void tqr( } void ztm(dcomplex **am, ParticleDescriptor *c1) { // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; // int i2 = 0; // old implementation double *rac3j = new double[c1->lmtpo]; #ifdef USE_NVTX nvtxRangePush("ZTM starts"); #endif #ifdef USE_NVTX nvtxRangePush("ZTM parallel loop 1"); #endif // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; int k2max = c1->li*(c1->li+2); Loading @@ -2266,15 +2262,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(3) // #else // #pragma omp parallel for simd collapse(3) // #endif // #pragma omp parallel for simd collapse(3) #pragma omp parallel for simd collapse(3) for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { double rac3j[c1->lmtpo]; int l2 = (int) sqrt(k2+1); int im2 = k2 - (l2*l2) + 1; if (im2 == 0) { Loading @@ -2295,23 +2287,17 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { im3 -= 2*l3 + 1; l3++; } // int l2tpo = l2 + l2 + 1; // int l3tpo = l3 + l3 + 1; int i2 = (n2-1) * c1->li * (c1->li + 2) + l2 * l2 + im2 - 1; int m2 = -l2 - 1 + im2; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; // double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); // gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); // gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); // free(rac3j_local); gis_v[vecindex] = ghit(2, 0, n2, l2, m2, l3, m3, c1, rac3j); gls_v[vecindex] = ghit(2, 1, n2, l2, m2, l3, m3, c1, rac3j); } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop delete[] rac3j; // delete[] rac3j; #ifdef USE_NVTX nvtxRangePop(); #endif Loading @@ -2320,11 +2306,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { Loading Loading @@ -2358,11 +2339,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { Loading @@ -2371,12 +2348,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { gls_v[vecindex] = dconjg(gls_v[vecindex]); } // i0 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { Loading @@ -2399,8 +2372,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int vecind0e = (i0e - 1) * c1->nlemt; vec_am0m[vecind0 + i3 - 1] = -sum1; vec_am0m[vecind0e + i3 - 1] = -sum2; // c1->am0m[i0 - 1][i3 - 1] = -sum1; // c1->am0m[i0e - 1][i3 - 1] = -sum2; } // i3 loop } // i0 loop #ifdef USE_NVTX Loading