#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
for(intj10=1;j10<=nlemt;j10++){
for(intj10=1;j10<=nlemt;j10++){
intj=j10-1;
intj=j10-1;
c1+=(vec_am0m[i*nlemt+j]*w[j][0]);
c1+=(vec_am0m[i*nlemt+j]*vec_w[4*j]);
c2+=(vec_am0m[i*nlemt+j]*w[j][1]);
c2+=(vec_am0m[i*nlemt+j]*vec_w[4*j+1]);
// c1 += (am0m[i][j] * w[j][0]);
// c2 += (am0m[i][j] * w[j][1]);
}// j10 loop
}// j10 loop
#ifdef USE_NVTX
nvtxRangePop();
#endif
vec_a[2*i]=c1;
vec_a[2*i]=c1;
vec_a[2*i+1]=c2;
vec_a[2*i+1]=c2;
// a[i][0] = c1;
// a[i][1] = c2;
}//i20 loop
}//i20 loop
intjpo=2;
#ifdef USE_NVTX
for(intipo70=1;ipo70<=2;ipo70++){
nvtxRangePop();
if(ipo70==2)jpo=1;
#endif
intipo=ipo70-1;
#ifdef USE_NVTX
nvtxRangePush("raba outer loop 2");
#endif
#pragma omp teams distribute parallel for
for(intipo=0;ipo<2;ipo++){
intjpo=1-ipo;
intjpo=1-ipo;
ctqce[ipo][0]=cc0;
ctqce[ipo][0]=cc0;
ctqce[ipo][1]=cc0;
ctqce[ipo][1]=cc0;
// NOTE(review): a stray unified-diff hunk header ("@@ -1975,6 +1985,9 @@ void raba(")
// was embedded here — artifact of a bad merge/extraction; removed. This region (and the
// duplicated lines throughout this chunk) should be reconciled against the original file.
dcomplex&tqcps2=tqcps[ipo][2];
dcomplex&tqcps2=tqcps[ipo][2];
intkmax=le*(le+2);
intkmax=le*(le+2);
// For efficiency, array w should also be linearised, but that is not easily done here:
// its major dimension is not known for certain (changes to the containing class would be needed).
#ifdef USE_NVTX
nvtxRangePush("raba inner loop 2");
#endif
#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
// for (int ai = 1; ai < nlemt; ai++) a[ai] = a[0]+ai*2;
// ctqce[1] = ctqce[0]+3;
// ctqcs[1] = ctqcs[0]+3;
// dcomplex *vec_am0m = am0m[0];
// // I cannot vectorise easily the access to w, since its size can follow either li or le, and I don't know here
// #pragma omp parallel for
// for (int i20 = 1; i20 <= nlemt; i20++) {
// int i = i20 - 1;
// dcomplex c1 = cc0;
// dcomplex c2 = cc0;
// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
// for (int j10 = 1; j10 <= nlemt; j10++) {
// int j = j10 - 1;
// // this is actually a matrix multiplication a = am0m x w, I could substitute the whole nested loop with a single BLAS level 3 call
// c1 += (vec_am0m[i*nlemt+j] * w[j][0]);
// c2 += (vec_am0m[i*nlemt+j] * w[j][1]);
// } // j10 loop
// vec_a[2*i] = c1;
// vec_a[2*i+1] = c2;
// } //i20 loop
// //int jpo = 2;
// // for (int ipo70 = 1; ipo70 <= 2; ipo70++) {
// // if (ipo70 == 2) jpo = 1;
// // int ipo = ipo70 - 1;
// // these are 2 x 3 arrays, in principle there should be a double loop over their two indices, but the second was explicitly unrolled here. This is senseless, either unroll both indices for speed, or unroll none for clarity, doing it halfway achieves neither
// // ctqce[ipo][0] = cc0;
// // ctqce[ipo][1] = cc0;
// // ctqce[ipo][2] = cc0;
// // tqcpe[ipo][0] = cc0;
// // tqcpe[ipo][1] = cc0;
// // tqcpe[ipo][2] = cc0;
// // ctqcs[ipo][0] = cc0;
// // ctqcs[ipo][1] = cc0;
// // ctqcs[ipo][2] = cc0;
// // tqcps[ipo][0] = cc0;
// // tqcps[ipo][1] = cc0;
// // tqcps[ipo][2] = cc0;
// // so let's go for it all the way, unroll both dimensions, use auxiliary scalar variables for the reduction, to declare them in the omp pragma
// dcomplex ctqce00 = cc0;
// dcomplex ctqce01 = cc0;
// dcomplex ctqce02 = cc0;
// dcomplex ctqce10 = cc0;
// dcomplex ctqce11 = cc0;
// dcomplex ctqce12 = cc0;
// dcomplex tqcpe00 = cc0;
// dcomplex tqcpe01 = cc0;
// dcomplex tqcpe02 = cc0;
// dcomplex tqcpe10 = cc0;
// dcomplex tqcpe11 = cc0;
// dcomplex tqcpe12 = cc0;
// dcomplex ctqcs00 = cc0;
// dcomplex ctqcs01 = cc0;
// dcomplex ctqcs02 = cc0;
// dcomplex ctqcs10 = cc0;
// dcomplex ctqcs11 = cc0;
// dcomplex ctqcs12 = cc0;
// dcomplex tqcps00 = cc0;
// dcomplex tqcps01 = cc0;
// dcomplex tqcps02 = cc0;
// dcomplex tqcps10 = cc0;
// dcomplex tqcps11 = cc0;
// dcomplex tqcps12 = cc0;
// // To parallelise, I run a linearised loop directly over k
// // working out the algebra, it turns out that
// // k = l60*l60-1+im60
// // we invert this to find
// // l60 = (int) sqrt(k+1) and im60 = k - l60*60+1
// // but if it results im60 = 0, then we set l60 = l60-1 and im60 = 2*l60+1
// // furthermore if it results im60 > 2*l60+1, then we set
// // im60 = im60 -(2*l60+1) and l60 = l60+1 (there was a rounding error in a nearly exact root)
// // with the following kmax, l60 goes from 1 to le, and im60 from 1 to 2*l60+1