Loading src/libnptm/clu_subs.cpp +28 −16 Original line number Diff line number Diff line Loading @@ -2222,10 +2222,6 @@ void scr2( cph = uim * exri * vkarg; int ls = (c4->li < c4->le) ? c4->li : c4->le; int kmax = ls*(ls+2); dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; dcomplex *vec_rmi = c1->rmi[0]; dcomplex *vec_rei = c1->rei[0]; dcomplex *vec_w = c1->w[0]; Loading @@ -2233,7 +2229,7 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 1"); #endif //#pragma omp parallel for reduction(+:tsas00, tsas10, tsas01, tsas11) #pragma omp parallel for for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2290,27 +2286,43 @@ void scr2( vec_sas[vecindex+3] = s22 * csam; } // label 12 // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); // tsas00 += (c1->sas[iogi - 1][0][0] * phas); // tsas10 += (c1->sas[iogi - 1][1][0] * phas); // tsas01 += (c1->sas[iogi - 1][0][1] * phas); // tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); #endif dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); tsas00 += (c1->sas[iogi - 1][0][0] * phas); tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); #endif c3->tsas[0][0] = tsas00; c3->tsas[1][0] = tsas10; c3->tsas[0][1] = tsas01; c3->tsas[1][1] = tsas11; #ifdef USE_NVTX nvtxRangePop(); #endif dcomplex *vec_vints = c1->vints[0]; #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 2"); nvtxRangePush("scr2 outer loop 3"); #endif #pragma omp parallel for for (int i24 = 1; i24 <= c4->nsph; i24++) { Loading @@ -2318,7 +2330,7 @@ void scr2( if (iogi >= i24) { // int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 2"); nvtxRangePush("scr2 inner loop 3"); #endif #pragma omp target teams distribute parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { Loading @@ -2341,7 +2353,7 @@ void scr2( #endif // int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 3"); nvtxRangePush("scr2 loop 4"); #endif #pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { Loading Loading
src/libnptm/clu_subs.cpp +28 −16 Original line number Diff line number Diff line Loading @@ -2222,10 +2222,6 @@ void scr2( cph = uim * exri * vkarg; int ls = (c4->li < c4->le) ? c4->li : c4->le; int kmax = ls*(ls+2); dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; dcomplex *vec_rmi = c1->rmi[0]; dcomplex *vec_rei = c1->rei[0]; dcomplex *vec_w = c1->w[0]; Loading @@ -2233,7 +2229,7 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 1"); #endif //#pragma omp parallel for reduction(+:tsas00, tsas10, tsas01, tsas11) #pragma omp parallel for for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2290,27 +2286,43 @@ void scr2( vec_sas[vecindex+3] = s22 * csam; } // label 12 // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); // tsas00 += (c1->sas[iogi - 1][0][0] * phas); // tsas10 += (c1->sas[iogi - 1][1][0] * phas); // tsas01 += (c1->sas[iogi - 1][0][1] * phas); // tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); #endif dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); tsas00 += (c1->sas[iogi - 1][0][0] * phas); tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop #ifdef USE_NVTX nvtxRangePop(); #endif c3->tsas[0][0] = tsas00; c3->tsas[1][0] = tsas10; c3->tsas[0][1] = tsas01; c3->tsas[1][1] = tsas11; #ifdef USE_NVTX nvtxRangePop(); #endif dcomplex *vec_vints = c1->vints[0]; #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 2"); nvtxRangePush("scr2 outer loop 3"); #endif #pragma omp parallel for for (int i24 = 1; i24 <= c4->nsph; i24++) { Loading @@ -2318,7 +2330,7 @@ void scr2( if (iogi >= i24) { // int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 2"); nvtxRangePush("scr2 inner loop 3"); #endif #pragma omp target teams distribute parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { Loading @@ -2341,7 +2353,7 @@ void scr2( #endif // int j = 0; #ifdef USE_NVTX nvtxRangePush("scr2 loop 3"); nvtxRangePush("scr2 loop 4"); #endif #pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { Loading