Loading src/libnptm/clu_subs.cpp +40 −146 Original line number Original line Diff line number Diff line Loading @@ -2288,9 +2288,6 @@ void scr2( double vk, double vkarg, double exri, double *duk, double vk, double vkarg, double exri, double *duk, ParticleDescriptor *c1 ParticleDescriptor *c1 ) { ) { #ifdef USE_NVTX nvtxRangePush("scr2 starts"); #endif const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; const dcomplex uim = 0.0 + 1.0 * I; dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc; dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc; Loading @@ -2300,177 +2297,74 @@ void scr2( double cfsq = 4.0 / (pi4sq * ccs * ccs); double cfsq = 4.0 / (pi4sq * ccs * ccs); cph = uim * exri * vkarg; cph = uim * exri * vkarg; int ls = (c1->li < c1->le) ? c1->li : c1->le; int ls = (c1->li < c1->le) ? c1->li : c1->le; int kmax = ls*(ls+2); c1->tsas[0][0] = cc0; dcomplex *vec_rmi = c1->rmi[0]; c1->tsas[1][0] = cc0; dcomplex *vec_rei = c1->rei[0]; c1->tsas[0][1] = cc0; dcomplex *vec_w = c1->w[0]; c1->tsas[1][1] = cc0; dcomplex *vec_sas = c1->sas[0][0]; #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 1"); #endif #pragma omp parallel for for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int i = i14 - 1; int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; if (iogi >= i14) { if (iogi >= i14) { // int k = 0; int k = 0; dcomplex s11 = cc0; s11 = cc0; dcomplex s21 = cc0; s21 = cc0; dcomplex s12 = cc0; s12 = cc0; dcomplex s22 = cc0; s22 = cc0; // To parallelise, I run a linearised loop directly over k for (int l10 = 1; l10 <= ls; l10++) { // working out the algebra, it turns out that // k = l10*l10-1+im10 // we invert this to find // l10 = (int) sqrt(k+1) and im10 = k - l10*10+1 // but if it results im10 = 0, then we set l10 = l10-1 and im10 = 2*l10+1 // furthermore if it results im10 > 2*l10+1, then we set // im10 = im10 -(2*l10+1) and l10 = l10+1 (there was a rounding error in a nearly exact root) #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; if (im10 == 0) { l10--; im10 = 2*l10+1; } else if (im10 > 2*l10 + 1) { im10 -= 2*l10 + 1; l10++; } // I have all the indices in my linearised loop int l = l10 - 1; int l = l10 - 1; rm = 1.0 / c1->rmi[l][i]; re = 1.0 / c1->rei[l][i]; int ltpo = l10 + l10 + 1; for (int im10 = 1; im10 <= ltpo; im10++) { k++; int ke = k + c1->nlem; int ke = k + c1->nlem; int km1t4 = (k - 1)*4; s11 -= (c1->w[k - 1][2] * c1->w[k - 1][0] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][0] * re); int kem1t4 = (ke - 1)*4; s21 -= (c1->w[k - 1][3] * c1->w[k - 1][0] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][0] * re); dcomplex auxrm0 = vec_w[km1t4] / vec_rmi[l*c1->nsph+i]; s12 -= (c1->w[k - 1][2] * c1->w[k - 1][1] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][1] * re); dcomplex auxrm1 = vec_w[km1t4+1] / vec_rmi[l*c1->nsph+i]; s22 -= (c1->w[k - 1][3] * c1->w[k - 1][1] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][1] * re); dcomplex auxre0 = vec_w[kem1t4] / vec_rei[l*c1->nsph+i]; } // im10 loop dcomplex auxre1 = vec_w[kem1t4+1] / vec_rei[l*c1->nsph+i]; } // l10 loop s11 -= vec_w[km1t4+2] * auxrm0 + vec_w[kem1t4+2] * auxre0; c1->sas[i][0][0] = s11 * csam; s21 -= vec_w[km1t4+3] * auxrm0 + vec_w[kem1t4+3] * auxre0; c1->sas[i][1][0] = s21 * csam; s12 -= vec_w[km1t4+2] * auxrm1 + vec_w[kem1t4+2] * auxre1; c1->sas[i][0][1] = s12 * csam; s22 -= vec_w[km1t4+3] * auxrm1 + vec_w[kem1t4+3] * auxre1; c1->sas[i][1][1] = s22 * csam; } #ifdef USE_NVTX nvtxRangePop(); #endif int vecindex = i*4; vec_sas[vecindex] = s11 * csam; vec_sas[vecindex+2] = s21 * csam; vec_sas[vecindex+1] = s12 * csam; vec_sas[vecindex+3] = s22 * csam; } } // label 12 // label 12 // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); // tsas00 += (c1->sas[iogi - 1][0][0] * phas); c1->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas); // tsas10 += (c1->sas[iogi - 1][1][0] * phas); c1->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas); // tsas01 += (c1->sas[iogi - 1][0][1] * phas); c1->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas); // tsas11 += (c1->sas[iogi - 1][1][1] * phas); c1->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); #endif dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); tsas00 += (c1->sas[iogi - 1][0][0] * phas); tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop } // i14 loop c1->tsas[0][0] = tsas00; c1->tsas[1][0] = tsas10; c1->tsas[0][1] = tsas01; c1->tsas[1][1] = tsas11; #ifdef USE_NVTX nvtxRangePop(); //#endif //dcomplex *vec_vints = c1->vints[0]; //#ifdef USE_NVTX nvtxRangePush("scr2 outer loop 3"); #endif #pragma omp parallel for for (int i24 = 1; i24 <= c1->nsph; i24++) { for (int i24 = 1; i24 <= c1->nsph; i24++) { int iogi = c1->iog[i24 - 1]; int iogi = c1->iog[i24 - 1]; if (iogi >= i24) { if (iogi >= i24) { // int j = 0; int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(4) // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { int j = jpo2 - 1 + (ipo2 - 1) * 2 + (jpo1 - 1) * 4 + (ipo1 - 1) * 8; j++; c1->vints[i24 - 1][j] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq; c1->vints[i24 - 1][j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * cc * cfsq; } // jpo2 loop } // jpo2 loop } // ipo2 loop } // ipo2 loop } // jpo1 loop } // jpo1 loop } // ipo1 loop } // ipo1 loop #ifdef USE_NVTX nvtxRangePop(); #endif } } } // i24 loop } // i24 loop #ifdef USE_NVTX int j = 0; nvtxRangePop(); #endif // int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for collapse(4) // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { cc = dconjg(c1->tsas[jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; j++; c1->vintt[j] = c1->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c1->tsas[jpo1 - 1][ipo1 - 1]) * cfsq; c1->vintt[j - 1] = c1->tsas[jpo2 - 1][ipo2 - 1] * cc * cfsq; } // jpo2 loop } // jpo2 loop } // ipo2 loop } // ipo2 loop } // jpo1 loop } // jpo1 loop } // ipo1 loop } // ipo1 loop #ifdef USE_NVTX nvtxRangePop(); #endif #ifdef USE_NVTX nvtxRangePop(); #endif } } void str(double **rcf, ParticleDescriptor *c1) { void str(double **rcf, ParticleDescriptor *c1) { Loading Loading
src/libnptm/clu_subs.cpp +40 −146 Original line number Original line Diff line number Diff line Loading @@ -2288,9 +2288,6 @@ void scr2( double vk, double vkarg, double exri, double *duk, double vk, double vkarg, double exri, double *duk, ParticleDescriptor *c1 ParticleDescriptor *c1 ) { ) { #ifdef USE_NVTX nvtxRangePush("scr2 starts"); #endif const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; const dcomplex uim = 0.0 + 1.0 * I; dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc; dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc; Loading @@ -2300,177 +2297,74 @@ void scr2( double cfsq = 4.0 / (pi4sq * ccs * ccs); double cfsq = 4.0 / (pi4sq * ccs * ccs); cph = uim * exri * vkarg; cph = uim * exri * vkarg; int ls = (c1->li < c1->le) ? c1->li : c1->le; int ls = (c1->li < c1->le) ? c1->li : c1->le; int kmax = ls*(ls+2); c1->tsas[0][0] = cc0; dcomplex *vec_rmi = c1->rmi[0]; c1->tsas[1][0] = cc0; dcomplex *vec_rei = c1->rei[0]; c1->tsas[0][1] = cc0; dcomplex *vec_w = c1->w[0]; c1->tsas[1][1] = cc0; dcomplex *vec_sas = c1->sas[0][0]; #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 1"); #endif #pragma omp parallel for for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int i = i14 - 1; int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; if (iogi >= i14) { if (iogi >= i14) { // int k = 0; int k = 0; dcomplex s11 = cc0; s11 = cc0; dcomplex s21 = cc0; s21 = cc0; dcomplex s12 = cc0; s12 = cc0; dcomplex s22 = cc0; s22 = cc0; // To parallelise, I run a linearised loop directly over k for (int l10 = 1; l10 <= ls; l10++) { // working out the algebra, it turns out that // k = l10*l10-1+im10 // we invert this to find // l10 = (int) sqrt(k+1) and im10 = k - l10*10+1 // but if it results im10 = 0, then we set l10 = l10-1 and im10 = 2*l10+1 // furthermore if it results im10 > 2*l10+1, then we set // im10 = im10 -(2*l10+1) and l10 = l10+1 (there was a rounding error in a nearly exact root) #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; if (im10 == 0) { l10--; im10 = 2*l10+1; } else if (im10 > 2*l10 + 1) { im10 -= 2*l10 + 1; l10++; } // I have all the indices in my linearised loop int l = l10 - 1; int l = l10 - 1; rm = 1.0 / c1->rmi[l][i]; re = 1.0 / c1->rei[l][i]; int ltpo = l10 + l10 + 1; for (int im10 = 1; im10 <= ltpo; im10++) { k++; int ke = k + c1->nlem; int ke = k + c1->nlem; int km1t4 = (k - 1)*4; s11 -= (c1->w[k - 1][2] * c1->w[k - 1][0] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][0] * re); int kem1t4 = (ke - 1)*4; s21 -= (c1->w[k - 1][3] * c1->w[k - 1][0] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][0] * re); dcomplex auxrm0 = vec_w[km1t4] / vec_rmi[l*c1->nsph+i]; s12 -= (c1->w[k - 1][2] * c1->w[k - 1][1] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][1] * re); dcomplex auxrm1 = vec_w[km1t4+1] / vec_rmi[l*c1->nsph+i]; s22 -= (c1->w[k - 1][3] * c1->w[k - 1][1] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][1] * re); dcomplex auxre0 = vec_w[kem1t4] / vec_rei[l*c1->nsph+i]; } // im10 loop dcomplex auxre1 = vec_w[kem1t4+1] / vec_rei[l*c1->nsph+i]; } // l10 loop s11 -= vec_w[km1t4+2] * auxrm0 + vec_w[kem1t4+2] * auxre0; c1->sas[i][0][0] = s11 * csam; s21 -= vec_w[km1t4+3] * auxrm0 + vec_w[kem1t4+3] * auxre0; c1->sas[i][1][0] = s21 * csam; s12 -= vec_w[km1t4+2] * auxrm1 + vec_w[kem1t4+2] * auxre1; c1->sas[i][0][1] = s12 * csam; s22 -= vec_w[km1t4+3] * auxrm1 + vec_w[kem1t4+3] * auxre1; c1->sas[i][1][1] = s22 * csam; } #ifdef USE_NVTX nvtxRangePop(); #endif int vecindex = i*4; vec_sas[vecindex] = s11 * csam; vec_sas[vecindex+2] = s21 * csam; vec_sas[vecindex+1] = s12 * csam; vec_sas[vecindex+3] = s22 * csam; } } // label 12 // label 12 // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); // tsas00 += (c1->sas[iogi - 1][0][0] * phas); c1->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas); // tsas10 += (c1->sas[iogi - 1][1][0] * phas); c1->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas); // tsas01 += (c1->sas[iogi - 1][0][1] * phas); c1->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas); // tsas11 += (c1->sas[iogi - 1][1][1] * phas); c1->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); #endif dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); tsas00 += (c1->sas[iogi - 1][0][0] * phas); tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop } // i14 loop c1->tsas[0][0] = tsas00; c1->tsas[1][0] = tsas10; c1->tsas[0][1] = tsas01; c1->tsas[1][1] = tsas11; #ifdef USE_NVTX nvtxRangePop(); //#endif //dcomplex *vec_vints = c1->vints[0]; //#ifdef USE_NVTX nvtxRangePush("scr2 outer loop 3"); #endif #pragma omp parallel for for (int i24 = 1; i24 <= c1->nsph; i24++) { for (int i24 = 1; i24 <= c1->nsph; i24++) { int iogi = c1->iog[i24 - 1]; int iogi = c1->iog[i24 - 1]; if (iogi >= i24) { if (iogi >= i24) { // int j = 0; int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(4) // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { int j = jpo2 - 1 + (ipo2 - 1) * 2 + (jpo1 - 1) * 4 + (ipo1 - 1) * 8; j++; c1->vints[i24 - 1][j] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq; c1->vints[i24 - 1][j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * cc * cfsq; } // jpo2 loop } // jpo2 loop } // ipo2 loop } // ipo2 loop } // jpo1 loop } // jpo1 loop } // ipo1 loop } // ipo1 loop #ifdef USE_NVTX nvtxRangePop(); #endif } } } // i24 loop } // i24 loop #ifdef USE_NVTX int j = 0; nvtxRangePop(); #endif // int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for collapse(4) // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { cc = dconjg(c1->tsas[jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; j++; c1->vintt[j] = c1->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c1->tsas[jpo1 - 1][ipo1 - 1]) * cfsq; c1->vintt[j - 1] = c1->tsas[jpo2 - 1][ipo2 - 1] * cc * cfsq; } // jpo2 loop } // jpo2 loop } // ipo2 loop } // ipo2 loop } // jpo1 loop } // jpo1 loop } // ipo1 loop } // ipo1 loop #ifdef USE_NVTX nvtxRangePop(); #endif #ifdef USE_NVTX nvtxRangePop(); #endif } } void str(double **rcf, ParticleDescriptor *c1) { void str(double **rcf, ParticleDescriptor *c1) { Loading