Loading src/libnptm/clu_subs.cpp +50 −45 Original line number Diff line number Diff line Loading @@ -2259,10 +2259,6 @@ void scr2( vec_sas[vecindex+2] = s21 * csam; vec_sas[vecindex+1] = s12 * csam; vec_sas[vecindex+3] = s22 * csam; // c1->sas[i][0][0] = s11 * csam; // c1->sas[i][1][0] = s21 * csam; // c1->sas[i][0][1] = s12 * csam; // c1->sas[i][1][1] = s22 * csam; } // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); Loading @@ -2270,10 +2266,6 @@ void scr2( tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); // c3->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas); // c3->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas); // c3->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas); // c3->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); Loading @@ -2297,11 +2289,9 @@ void scr2( #pragma omp target teams distribute parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { // cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { int j = jpo2 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; // j++; vec_vints[(i24 - 1)*16+j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq; } // jpo2 loop } // ipo2 loop Loading @@ -2322,10 +2312,8 @@ void scr2( #pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { // cc = dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { // j++; int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; c1ao->vintt[j] = c3->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]) * cfsq; } // jpo2 loop Loading Loading @@ -2443,38 +2431,54 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c9->gis[0]; dcomplex *gls_v = c9->gls[0]; #pragma omp target parallel { double *rac3j_local = (double *) malloc(c6->lmtpo*sizeof(double)); #pragma omp for collapse(3) int k2max = c4->li*(c4->li+2); int k3max = c4->le*(c4->le+2); // To parallelise, I run a linearised loop directly over k // working out the algebra, it turns out that // k = l*l-1+im // we invert this to find // l = (int) sqrt(k+1) and im = k - l*l+1 // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) # pragma omp target teams distribute parallel for simd collapse(3) for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable? for (int l2 = 1; l2 <= c4->li; l2++) { for (int l3 = 1; l3 <= c4->le; l3++) { int l2tpo = l2 + l2 + 1; int l3tpo = l3 + l3 + 1; for (int im2 = 1; im2 <= l2tpo; im2++) { for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { int l2 = (int) sqrt(k2+1); int im2 = k2 - (l2*l2) + 1; if (im2 == 0) { l2--; im2 = 2*l2+1; } else if (im2 > 2*l2 + 1) { im2 -= 2*l2 + 1; l2++; } int l3 = (int) sqrt(k3+1); int im3 = k3 - (l3*l3) + 1; if (im3 == 0) { l3--; im3 = 2*l3+1; } else if (im3 > 2*l3 + 1) { im3 -= 2*l3 + 1; l3++; } // int l2tpo = l2 + l2 + 1; // int l3tpo = l3 + l3 + 1; int i2 = (n2-1) * c4->li * (c4->li + 2) + l2 * l2 + im2 - 1; // int vecindex0 = (i2 - 1)*c9->nlem; int m2 = -l2 - 1 + im2; // i2++; // old implementation // int i3 = 0; // old implementation //#pragma omp for for (int im3 = 1; im3 <= l3tpo; im3++) { int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1)*c9->nlem + i3 - 1; // i3++; // old implementation gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local); // c9->gis[i2 - 1][i3 - 1] = ghit(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local); // c9->gls[i2 - 1][i3 - 1] = ghit(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local); } // im3 loop } // l3 loop } // im2 loop } // l2 loop } // n2 loop } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop free(rac3j_local); } // pragma omp parallel #ifdef USE_NVTX nvtxRangePop(); #endif Loading @@ -2483,7 +2487,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c9->sam[0]; # pragma omp target teams distribute parallel for collapse(2) # pragma omp target teams distribute parallel for simd collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c4->nlem; i3++) { dcomplex sum1 = cc0; Loading @@ -2492,6 +2496,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { dcomplex sum4 = cc0; int i1e = i1 + ndi; int i3e = i3 + c4->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= ndi; i2++) { int i2e = i2 + ndi; int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1; Loading Loading
src/libnptm/clu_subs.cpp +50 −45 Original line number Diff line number Diff line Loading @@ -2259,10 +2259,6 @@ void scr2( vec_sas[vecindex+2] = s21 * csam; vec_sas[vecindex+1] = s12 * csam; vec_sas[vecindex+3] = s22 * csam; // c1->sas[i][0][0] = s11 * csam; // c1->sas[i][1][0] = s21 * csam; // c1->sas[i][0][1] = s12 * csam; // c1->sas[i][1][1] = s22 * csam; } // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); Loading @@ -2270,10 +2266,6 @@ void scr2( tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); // c3->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas); // c3->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas); // c3->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas); // c3->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); Loading @@ -2297,11 +2289,9 @@ void scr2( #pragma omp target teams distribute parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { // cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { int j = jpo2 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; // j++; vec_vints[(i24 - 1)*16+j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq; } // jpo2 loop } // ipo2 loop Loading @@ -2322,10 +2312,8 @@ void scr2( #pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { // cc = dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { // j++; int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; c1ao->vintt[j] = c3->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]) * cfsq; } // jpo2 loop Loading Loading @@ -2443,38 +2431,54 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c9->gis[0]; dcomplex *gls_v = c9->gls[0]; #pragma omp target parallel { double *rac3j_local = (double *) malloc(c6->lmtpo*sizeof(double)); #pragma omp for collapse(3) int k2max = c4->li*(c4->li+2); int k3max = c4->le*(c4->le+2); // To parallelise, I run a linearised loop directly over k // working out the algebra, it turns out that // k = l*l-1+im // we invert this to find // l = (int) sqrt(k+1) and im = k - l*l+1 // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) # pragma omp target teams distribute parallel for simd collapse(3) for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable? for (int l2 = 1; l2 <= c4->li; l2++) { for (int l3 = 1; l3 <= c4->le; l3++) { int l2tpo = l2 + l2 + 1; int l3tpo = l3 + l3 + 1; for (int im2 = 1; im2 <= l2tpo; im2++) { for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { int l2 = (int) sqrt(k2+1); int im2 = k2 - (l2*l2) + 1; if (im2 == 0) { l2--; im2 = 2*l2+1; } else if (im2 > 2*l2 + 1) { im2 -= 2*l2 + 1; l2++; } int l3 = (int) sqrt(k3+1); int im3 = k3 - (l3*l3) + 1; if (im3 == 0) { l3--; im3 = 2*l3+1; } else if (im3 > 2*l3 + 1) { im3 -= 2*l3 + 1; l3++; } // int l2tpo = l2 + l2 + 1; // int l3tpo = l3 + l3 + 1; int i2 = (n2-1) * c4->li * (c4->li + 2) + l2 * l2 + im2 - 1; // int vecindex0 = (i2 - 1)*c9->nlem; int m2 = -l2 - 1 + im2; // i2++; // old implementation // int i3 = 0; // old implementation //#pragma omp for for (int im3 = 1; im3 <= l3tpo; im3++) { int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1)*c9->nlem + i3 - 1; // i3++; // old implementation gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local); // c9->gis[i2 - 1][i3 - 1] = ghit(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local); // c9->gls[i2 - 1][i3 - 1] = ghit(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local); } // im3 loop } // l3 loop } // im2 loop } // l2 loop } // n2 loop } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop free(rac3j_local); } // pragma omp parallel #ifdef USE_NVTX nvtxRangePop(); #endif Loading @@ -2483,7 +2487,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c9->sam[0]; # pragma omp target teams distribute parallel for collapse(2) # pragma omp target teams distribute parallel for simd collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c4->nlem; i3++) { dcomplex sum1 = cc0; Loading @@ -2492,6 +2496,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { dcomplex sum4 = cc0; int i1e = i1 + ndi; int i3e = i3 + c4->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= ndi; i2++) { int i2e = i2 + ndi; int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1; Loading