Loading .gitlab-ci.yml +3 −3 Original line number Original line Diff line number Diff line Loading @@ -78,13 +78,13 @@ compatibility_stage: - CXX=g++-14 FC=gfortran-14 ./configure - CXX=g++-14 FC=gfortran-14 ./configure - make wipe - make wipe - make -j - make -j - echo "Running make with refinement with gnu compilers version 14..." - echo "Running make with gnu compilers version 14..." - cd .. - cd .. - rm -rf build_gnu14 - rm -rf build_gnu14 - mkdir build_gnu14_refine - mkdir build_gnu14_refine - cd build_gnu14_refine - cd build_gnu14_refine - cp -r ../build/* . - cp -r ../build/* . - CXX=g++-14 FC=gfortran-14 ./configure --enable-refinement - CXX=g++-14 FC=gfortran-14 ./configure - make wipe - make wipe - make -j - make -j #- echo "Running make with flang version 16 and clang version 16..." #- echo "Running make with flang version 16 and clang version 16..." Loading Loading @@ -173,7 +173,7 @@ building_stage: - cat /etc/os-release - cat /etc/os-release - cd build - cd build - echo "Configuring with default compilers (MAGMA disabled)..." - echo "Configuring with default compilers (MAGMA disabled)..." - ./configure --without-magma --without-cublas --disable-offload --enable-refinement --enable-shared - ./configure --without-magma --without-cublas --disable-offload --enable-shared - make wipe - make wipe - echo "Building the default configuration..." - echo "Building the default configuration..." - make -j - make -j Loading src/libnptm/clu_subs.cpp +87 −52 Original line number Original line Diff line number Diff line Loading @@ -1341,6 +1341,8 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif #endif for (int i12 = 0; i12 < nlemt; i12++) { for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; // int i = i12 - 1; Loading Loading @@ -1408,6 +1410,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { sum3 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else #pragma omp parallel for simd reduction(+:sum2,sum3) #endif #endif for (int i14 = 0; i14 < c1->nlem; i14++) { for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; int ie = i14 + c1->nlem; Loading @@ -1418,6 +1422,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { dcomplex sumpd = cc0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { Loading Loading @@ -2001,6 +2007,8 @@ void raba( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else #pragma omp parallel for simd reduction(+:c1, c2) #endif #endif for (int j10 = 1; j10 <= nlemt; j10++) { for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; int j = j10 - 1; Loading @@ -2021,6 +2029,8 @@ void raba( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for #pragma omp teams distribute parallel for #else #pragma omp parallel for #endif #endif for (int ipo = 0; ipo < 2; ipo++) { for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; int jpo = 1 - ipo; Loading Loading @@ -2055,6 +2065,8 @@ void raba( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int l60 = (int) sqrt(k+1); Loading Loading @@ -2129,7 +2141,9 @@ void raba( nvtxRangePush("raba loop 3"); nvtxRangePush("raba loop 3"); #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for simd #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for simd #endif #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; int ipo = ipo78 - 1; Loading Loading @@ -2202,6 +2216,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else #pragma omp parallel for simd reduction(+:sums, sum21) #endif #endif for (int l10 = 1; l10 <= c1->li; l10++) { for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); double fl = 1.0 * (l10 + l10 + 1); Loading Loading @@ -2248,6 +2264,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2312,6 +2330,8 @@ void scr2( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int l10 = (int) sqrt(k+1); Loading Loading @@ -2366,6 +2386,8 @@ void scr2( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int i = i14 - 1; Loading Loading @@ -2398,6 +2420,8 @@ void scr2( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) #pragma omp target teams distribute parallel for simd collapse(4) #else #pragma omp parallel for simd collapse(4) #endif #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { Loading @@ -2422,7 +2446,9 @@ void scr2( nvtxRangePush("scr2 loop 4"); nvtxRangePush("scr2 loop 4"); #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(4) #pragma omp target teams distribute parallel for collapse(4) #else #pragma omp parallel for collapse(4) #endif #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { Loading Loading @@ -2534,7 +2560,7 @@ void tqr( } } void ztm(dcomplex **am, ParticleDescriptor *c1) { void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I; // int i2 = 0; // old implementation // int i2 = 0; // old implementation #ifdef USE_NVTX #ifdef USE_NVTX Loading @@ -2546,7 +2572,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // C9 *c9_para = new C9(*c9); // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c1->gis[0]; dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; dcomplex *gls_v = c1->gls[0]; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); int k2max = c1->li*(c1->li+2); int k2max = c1->li*(c1->li+2); int k3max = c1->le*(c1->le+2); int k3max = c1->le*(c1->le+2); // To parallelise, I run a linearised loop directly over k // To parallelise, I run a linearised loop directly over k Loading @@ -2559,6 +2584,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #pragma omp target teams distribute parallel for simd collapse(3) #else #pragma omp parallel for simd collapse(3) #endif #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k2 = 1; k2<=k2max; k2++) { Loading Loading @@ -2590,12 +2617,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i3 = l3 * l3 + im3 - 1; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); free(rac3j_local); } // close k3 loop, former l3 + im3 loops } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop } // close n2 loop free(rac3j_local); #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif #endif Loading @@ -2606,6 +2634,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex *sam_v = c1->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { for (int i3 = 1; i3 <= c1->nlem; i3++) { Loading @@ -2615,7 +2645,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex sum4 = cc0; dcomplex sum4 = cc0; int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int i3e = i3 + c1->nlem; int i3e = i3 + c1->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= c1->ndi; i2++) { for (int i2 = 1; i2 <= c1->ndi; i2++) { int i2e = i2 + c1->ndi; int i2e = i2 + c1->ndi; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; Loading @@ -2640,7 +2669,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i3 loop } // i1 loop } // i1 loop #pragma omp parallel for collapse(2) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2650,7 +2683,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i1 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { Loading @@ -2661,11 +2696,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; a1 = sam_v[vecind1 + i3 - 1]; dcomplex a1 = sam_v[vecind1 + i3 - 1]; a2 = sam_v[vecind1e + i3 - 1]; dcomplex a2 = sam_v[vecind1e + i3 - 1]; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gie = gis_v[vecindex]; dcomplex gie = gis_v[vecindex]; gle = gls_v[vecindex]; dcomplex gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); sum2 += (a1 * gle + a2 * gie); } // i1 loop } // i1 loop Loading src/scripts/model_maker.py +5 −3 Original line number Original line Diff line number Diff line Loading @@ -328,6 +328,8 @@ def load_model(model_file): for j in range(expected_radii): for j in range(expected_radii): sconf['rcf'][i][j] = float(model['particle_settings']['rad_frac'][i][j]) sconf['rcf'][i][j] = float(model['particle_settings']['rad_frac'][i][j]) # Create the gconf dict # Create the gconf dict use_refinement = False dyn_orders = True try: try: use_refinement = False if model['system_settings']['refinement'] == "0" else True use_refinement = False if model['system_settings']['refinement'] == "0" else True except KeyError: except KeyError: Loading @@ -335,7 +337,7 @@ def load_model(model_file): try: try: dyn_orders = False if model['radiation_settings']['dyn_orders'] == "0" else True dyn_orders = False if model['radiation_settings']['dyn_orders'] == "0" else True except KeyError: except KeyError: use_refinement = False dyn_orders = True str_polar = model['radiation_settings']['polarization'] str_polar = model['radiation_settings']['polarization'] if (str_polar not in ["LINEAR", "CIRCULAR"]): if (str_polar not in ["LINEAR", "CIRCULAR"]): print("ERROR: %s is not a recognized polarization state."%str_polar) print("ERROR: %s is not a recognized polarization state."%str_polar) Loading Loading @@ -875,10 +877,10 @@ def write_legacy_gconf(conf): output.write(str_line) output.write(str_line) str_line = " {0:d}\n0\n".format(conf['jwtm']) str_line = " {0:d}\n0\n".format(conf['jwtm']) output.write(str_line) output.write(str_line) if (gconf['use_refinement']): if (conf['use_refinement']): str_line = "USE_REFINEMENT=1\n" str_line = "USE_REFINEMENT=1\n" output.write(str_line) output.write(str_line) if (not gconf['dyn_orders']): if (not conf['dyn_orders']): str_line = "USE_DYN_ORDER=0\n" str_line = "USE_DYN_ORDER=0\n" output.write(str_line) output.write(str_line) output.close() output.close() Loading src/trapping/cfrfme.cpp +57 −43 Original line number Original line Diff line number Diff line Loading @@ -61,6 +61,14 @@ #include <nvtx3/nvToolsExt.h> #include <nvtx3/nvToolsExt.h> #endif #endif #ifdef _OPENMP #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; using namespace std; /*! \brief C++ implementation of FRFME /*! \brief C++ implementation of FRFME Loading Loading @@ -391,58 +399,64 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("j80 loop"); nvtxRangePush("j80 loop"); #endif #endif #pragma omp parallel for dcomplex *vec_wsum = tfrfme->wsum[0]; for (int j80 = jlmf; j80 <= jlml; j80++) { double *vec_vkzm = vkzm[0]; dcomplex *vec_w = new dcomplex[nkv * nkv](); #ifdef USE_TARGET_OFFLOAD dcomplex **w = new dcomplex*[nkv]; #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for simd #endif for (int j80 = jlmf-1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; dcomplex wk_value; for (int jy50 = 0; jy50 < nkv; jy50++) { int wk_index = 0; for (int jx50 = 0; jx50 < nkv; jx50++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { wk_value = tt1->wk[jy50 * nkv * nlmmt + jx50 * nlmmt + j80 - 1]; wk_index = nlmmt*jxy50; w[jx50][jy50] = wk_value; wk_value = tt1->wk[wk_index + j80]; } // jx50 int jy50 = jxy50 / nkv; } // jy50 loop int jx50 = jxy50 % nkv; vec_w[(nkv*jx50) + jy50] = wk_value; } // jxy50 loop int ixyz = 0; int ixyz = 0; for (int iz75 = 0; iz75 < nzv; iz75++) { for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0; int nvtot = nxv*nyv*nzv; int nvxy = nxv *nyv; for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; double z = _zv[iz75] + frsh; double z = _zv[iz75] + frsh; for (int iy70 = 0; iy70 < nyv; iy70++) { double y = _yv[iy70]; double y = _yv[iy70]; for (int ix65 = 0; ix65 < nxv; ix65++) { double x = _xv[ix65]; double x = _xv[ix65]; ixyz++; dcomplex sumy = cc0; dcomplex sumy = cc0; for (int jy60 = 0; jy60 < nkv; jy60++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; double vky = vkv[jy60]; double vky = vkv[jy60]; double vkx = vkv[nkv - 1]; double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55]; double vkzf = vkzm[0][jy60]; double vkz = vec_vkzm[(jx55*nkv)+jy60]; dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z)); dcomplex phas = (jx55==0) ? double vkzl = vkzm[nkv - 1][jy60]; cexp(uim * (-vkx * x + vky * y + vkz * z)): dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z)); cexp(uim * (vkx * x + vky * y + vkz * z)); dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl); dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; for (int jx55 = 2; jx55 <= nks; jx55++) { double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0; vkx = vkv[jx55 - 1]; double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0; double vkz = vkzm[jx55 - 1][jy60]; sumx *= factor1*factor2; dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z)); sumx += (w[jx55 - 1][jy60] * phas); } // jx55 loop if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5; sumy += sumx; sumy += sumx; } // jy60 loop } // jy60x55 loop tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks; vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; } // ix65 loop } // ixyz loop } // iy70 loop free(vec_w); } // iz75 loop free(w); delete[] vec_w; delete[] w; } // j80 loop } // j80 loop #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif // label 88 #ifdef USE_NVTX nvtxRangePush("Closing operations"); nvtxRangePush("Closing operations"); #endif #endif // label 88 tfrfme->write_binary(tfrfme_name, "HDF5"); tfrfme->write_binary(tfrfme_name, "HDF5"); string output_name = output_path + "/c_OFRFME"; string output_name = output_path + "/c_OFRFME"; FILE *output = fopen(output_name.c_str(), "w"); FILE *output = fopen(output_name.c_str(), "w"); Loading build/configure.sh +1 −1 File changed.Contains only whitespace changes. Show changes Loading
.gitlab-ci.yml +3 −3 Original line number Original line Diff line number Diff line Loading @@ -78,13 +78,13 @@ compatibility_stage: - CXX=g++-14 FC=gfortran-14 ./configure - CXX=g++-14 FC=gfortran-14 ./configure - make wipe - make wipe - make -j - make -j - echo "Running make with refinement with gnu compilers version 14..." - echo "Running make with gnu compilers version 14..." - cd .. - cd .. - rm -rf build_gnu14 - rm -rf build_gnu14 - mkdir build_gnu14_refine - mkdir build_gnu14_refine - cd build_gnu14_refine - cd build_gnu14_refine - cp -r ../build/* . - cp -r ../build/* . - CXX=g++-14 FC=gfortran-14 ./configure --enable-refinement - CXX=g++-14 FC=gfortran-14 ./configure - make wipe - make wipe - make -j - make -j #- echo "Running make with flang version 16 and clang version 16..." #- echo "Running make with flang version 16 and clang version 16..." Loading Loading @@ -173,7 +173,7 @@ building_stage: - cat /etc/os-release - cat /etc/os-release - cd build - cd build - echo "Configuring with default compilers (MAGMA disabled)..." - echo "Configuring with default compilers (MAGMA disabled)..." - ./configure --without-magma --without-cublas --disable-offload --enable-refinement --enable-shared - ./configure --without-magma --without-cublas --disable-offload --enable-shared - make wipe - make wipe - echo "Building the default configuration..." - echo "Building the default configuration..." - make -j - make -j Loading
src/libnptm/clu_subs.cpp +87 −52 Original line number Original line Diff line number Diff line Loading @@ -1341,6 +1341,8 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif #endif for (int i12 = 0; i12 < nlemt; i12++) { for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; // int i = i12 - 1; Loading Loading @@ -1408,6 +1410,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { sum3 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else #pragma omp parallel for simd reduction(+:sum2,sum3) #endif #endif for (int i14 = 0; i14 < c1->nlem; i14++) { for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; int ie = i14 + c1->nlem; Loading @@ -1418,6 +1422,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { dcomplex sumpd = cc0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { Loading Loading @@ -2001,6 +2007,8 @@ void raba( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else #pragma omp parallel for simd reduction(+:c1, c2) #endif #endif for (int j10 = 1; j10 <= nlemt; j10++) { for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; int j = j10 - 1; Loading @@ -2021,6 +2029,8 @@ void raba( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for #pragma omp teams distribute parallel for #else #pragma omp parallel for #endif #endif for (int ipo = 0; ipo < 2; ipo++) { for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; int jpo = 1 - ipo; Loading Loading @@ -2055,6 +2065,8 @@ void raba( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int l60 = (int) sqrt(k+1); Loading Loading @@ -2129,7 +2141,9 @@ void raba( nvtxRangePush("raba loop 3"); nvtxRangePush("raba loop 3"); #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for simd #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for simd #endif #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; int ipo = ipo78 - 1; Loading Loading @@ -2202,6 +2216,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else #pragma omp parallel for simd reduction(+:sums, sum21) #endif #endif for (int l10 = 1; l10 <= c1->li; l10++) { for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); double fl = 1.0 * (l10 + l10 + 1); Loading Loading @@ -2248,6 +2264,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2312,6 +2330,8 @@ void scr2( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int l10 = (int) sqrt(k+1); Loading Loading @@ -2366,6 +2386,8 @@ void scr2( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int i = i14 - 1; Loading Loading @@ -2398,6 +2420,8 @@ void scr2( #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) #pragma omp target teams distribute parallel for simd collapse(4) #else #pragma omp parallel for simd collapse(4) #endif #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { Loading @@ -2422,7 +2446,9 @@ void scr2( nvtxRangePush("scr2 loop 4"); nvtxRangePush("scr2 loop 4"); #endif #endif #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(4) #pragma omp target teams distribute parallel for collapse(4) #else #pragma omp parallel for collapse(4) #endif #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { Loading Loading @@ -2534,7 +2560,7 @@ void tqr( } } void ztm(dcomplex **am, ParticleDescriptor *c1) { void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I; // int i2 = 0; // old implementation // int i2 = 0; // old implementation #ifdef USE_NVTX #ifdef USE_NVTX Loading @@ -2546,7 +2572,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // C9 *c9_para = new C9(*c9); // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c1->gis[0]; dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; dcomplex *gls_v = c1->gls[0]; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); int k2max = c1->li*(c1->li+2); int k2max = c1->li*(c1->li+2); int k3max = c1->le*(c1->le+2); int k3max = c1->le*(c1->le+2); // To parallelise, I run a linearised loop directly over k // To parallelise, I run a linearised loop directly over k Loading @@ -2559,6 +2584,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #pragma omp target teams distribute parallel for simd collapse(3) #else #pragma omp parallel for simd collapse(3) #endif #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k2 = 1; k2<=k2max; k2++) { Loading Loading @@ -2590,12 +2617,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i3 = l3 * l3 + im3 - 1; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); free(rac3j_local); } // close k3 loop, former l3 + im3 loops } // close k3 loop, former l3 + im3 loops } // close k2 loop, former l2 + im2 loops } // close k2 loop, former l2 + im2 loops } // close n2 loop } // close n2 loop free(rac3j_local); #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif #endif Loading @@ -2606,6 +2634,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex *sam_v = c1->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { for (int i3 = 1; i3 <= c1->nlem; i3++) { Loading @@ -2615,7 +2645,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex sum4 = cc0; dcomplex sum4 = cc0; int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int i3e = i3 + c1->nlem; int i3e = i3 + c1->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= c1->ndi; i2++) { for (int i2 = 1; i2 <= c1->ndi; i2++) { int i2e = i2 + c1->ndi; int i2e = i2 + c1->ndi; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; Loading @@ -2640,7 +2669,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i3 loop } // i1 loop } // i1 loop #pragma omp parallel for collapse(2) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2650,7 +2683,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i1 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(2) #pragma omp target teams distribute parallel for simd collapse(2) #else #pragma omp parallel for simd collapse(2) #endif #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { Loading @@ -2661,11 +2696,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { int i1e = i1 + c1->ndi; int i1e = i1 + c1->ndi; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; a1 = sam_v[vecind1 + i3 - 1]; dcomplex a1 = sam_v[vecind1 + i3 - 1]; a2 = sam_v[vecind1e + i3 - 1]; dcomplex a2 = sam_v[vecind1e + i3 - 1]; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gie = gis_v[vecindex]; dcomplex gie = gis_v[vecindex]; gle = gls_v[vecindex]; dcomplex gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); sum2 += (a1 * gle + a2 * gie); } // i1 loop } // i1 loop Loading
src/scripts/model_maker.py +5 −3 Original line number Original line Diff line number Diff line Loading @@ -328,6 +328,8 @@ def load_model(model_file): for j in range(expected_radii): for j in range(expected_radii): sconf['rcf'][i][j] = float(model['particle_settings']['rad_frac'][i][j]) sconf['rcf'][i][j] = float(model['particle_settings']['rad_frac'][i][j]) # Create the gconf dict # Create the gconf dict use_refinement = False dyn_orders = True try: try: use_refinement = False if model['system_settings']['refinement'] == "0" else True use_refinement = False if model['system_settings']['refinement'] == "0" else True except KeyError: except KeyError: Loading @@ -335,7 +337,7 @@ def load_model(model_file): try: try: dyn_orders = False if model['radiation_settings']['dyn_orders'] == "0" else True dyn_orders = False if model['radiation_settings']['dyn_orders'] == "0" else True except KeyError: except KeyError: use_refinement = False dyn_orders = True str_polar = model['radiation_settings']['polarization'] str_polar = model['radiation_settings']['polarization'] if (str_polar not in ["LINEAR", "CIRCULAR"]): if (str_polar not in ["LINEAR", "CIRCULAR"]): print("ERROR: %s is not a recognized polarization state."%str_polar) print("ERROR: %s is not a recognized polarization state."%str_polar) Loading Loading @@ -875,10 +877,10 @@ def write_legacy_gconf(conf): output.write(str_line) output.write(str_line) str_line = " {0:d}\n0\n".format(conf['jwtm']) str_line = " {0:d}\n0\n".format(conf['jwtm']) output.write(str_line) output.write(str_line) if (gconf['use_refinement']): if (conf['use_refinement']): str_line = "USE_REFINEMENT=1\n" str_line = "USE_REFINEMENT=1\n" output.write(str_line) output.write(str_line) if (not gconf['dyn_orders']): if (not conf['dyn_orders']): str_line = "USE_DYN_ORDER=0\n" str_line = "USE_DYN_ORDER=0\n" output.write(str_line) output.write(str_line) output.close() output.close() Loading
src/trapping/cfrfme.cpp +57 −43 Original line number Original line Diff line number Diff line Loading @@ -61,6 +61,14 @@ #include <nvtx3/nvToolsExt.h> #include <nvtx3/nvToolsExt.h> #endif #endif #ifdef _OPENMP #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; using namespace std; /*! \brief C++ implementation of FRFME /*! \brief C++ implementation of FRFME Loading Loading @@ -391,58 +399,64 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("j80 loop"); nvtxRangePush("j80 loop"); #endif #endif #pragma omp parallel for dcomplex *vec_wsum = tfrfme->wsum[0]; for (int j80 = jlmf; j80 <= jlml; j80++) { double *vec_vkzm = vkzm[0]; dcomplex *vec_w = new dcomplex[nkv * nkv](); #ifdef USE_TARGET_OFFLOAD dcomplex **w = new dcomplex*[nkv]; #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for simd #endif for (int j80 = jlmf-1; j80 < jlml; j80++) { int nkvs = nkv * nkv; dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; dcomplex wk_value; for (int jy50 = 0; jy50 < nkv; jy50++) { int wk_index = 0; for (int jx50 = 0; jx50 < nkv; jx50++) { for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { wk_value = tt1->wk[jy50 * nkv * nlmmt + jx50 * nlmmt + j80 - 1]; wk_index = nlmmt*jxy50; w[jx50][jy50] = wk_value; wk_value = tt1->wk[wk_index + j80]; } // jx50 int jy50 = jxy50 / nkv; } // jy50 loop int jx50 = jxy50 % nkv; vec_w[(nkv*jx50) + jy50] = wk_value; } // jxy50 loop int ixyz = 0; int ixyz = 0; for (int iz75 = 0; iz75 < nzv; iz75++) { for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0; int nvtot = nxv*nyv*nzv; int nvxy = nxv *nyv; for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; int ix65 = ixyz % nxv; double z = _zv[iz75] + frsh; double z = _zv[iz75] + frsh; for (int iy70 = 0; iy70 < nyv; iy70++) { double y = _yv[iy70]; double y = _yv[iy70]; for (int ix65 = 0; ix65 < nxv; ix65++) { double x = _xv[ix65]; double x = _xv[ix65]; ixyz++; dcomplex sumy = cc0; dcomplex sumy = cc0; for (int jy60 = 0; jy60 < nkv; jy60++) { for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; double vky = vkv[jy60]; double vky = vkv[jy60]; double vkx = vkv[nkv - 1]; double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55]; double vkzf = vkzm[0][jy60]; double vkz = vec_vkzm[(jx55*nkv)+jy60]; dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z)); dcomplex phas = (jx55==0) ? double vkzl = vkzm[nkv - 1][jy60]; cexp(uim * (-vkx * x + vky * y + vkz * z)): dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z)); cexp(uim * (vkx * x + vky * y + vkz * z)); dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl); dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; for (int jx55 = 2; jx55 <= nks; jx55++) { double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0; vkx = vkv[jx55 - 1]; double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0; double vkz = vkzm[jx55 - 1][jy60]; sumx *= factor1*factor2; dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z)); sumx += (w[jx55 - 1][jy60] * phas); } // jx55 loop if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5; sumy += sumx; sumy += sumx; } // jy60 loop } // jy60x55 loop tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks; vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; } // ix65 loop } // ixyz loop } // iy70 loop free(vec_w); } // iz75 loop free(w); delete[] vec_w; delete[] w; } // j80 loop } // j80 loop #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePop(); nvtxRangePop(); #endif // label 88 #ifdef USE_NVTX nvtxRangePush("Closing operations"); nvtxRangePush("Closing operations"); #endif #endif // label 88 tfrfme->write_binary(tfrfme_name, "HDF5"); tfrfme->write_binary(tfrfme_name, "HDF5"); string output_name = output_path + "/c_OFRFME"; string output_name = output_path + "/c_OFRFME"; FILE *output = fopen(output_name.c_str(), "w"); FILE *output = fopen(output_name.c_str(), "w"); Loading