Loading build/configure.sh +6 −6 Original line number Original line Diff line number Diff line Loading @@ -364,8 +364,8 @@ result=$? if [ "x$result" = "x0" ]; then if [ "x$result" = "x0" ]; then CLANGFLAGS=" -stdlib=libstdc++" CLANGFLAGS=" -stdlib=libstdc++" fi fi echo -n "configure: checking wether $CXX works... " echo -n "configure: checking whether $CXX works... " echo -n "configure: checking wether $CXX works... " >>configure.log echo -n "configure: checking whether $CXX works... " >>configure.log cat > test_compiler.cpp <<EOF cat > test_compiler.cpp <<EOF int main() { int main() { int i = -1; int i = -1; Loading @@ -385,8 +385,8 @@ else echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log exit 2 exit 2 fi fi echo -n "configure: checking wether $CXX supports -ggdb... " echo -n "configure: checking whether $CXX supports -ggdb... " echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log $CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log $CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log result=$? result=$? if [ "x$result" = "x0" ]; then if [ "x$result" = "x0" ]; then Loading @@ -410,8 +410,8 @@ else echo "no" echo "no" echo "no" >>configure.log echo "no" >>configure.log fi fi echo -n "configure: checking wether $CXX is a MPI compiler... " echo -n "configure: checking whether $CXX is a MPI compiler... " echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log cat > test_compiler.cpp <<EOF cat > test_compiler.cpp <<EOF # include <mpi.h> # include <mpi.h> int main() { int main() { Loading src/include/tfrfme.h +20 −30 Original line number Original line Diff line number Diff line Loading @@ -28,7 +28,7 @@ class Swap1 { protected: protected: //! Index of the last element to be filled. //! Index of the last element to be filled. int _last_index; int _last_index; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave-numbers. int _nkv; int _nkv; //! NLMMT = 2 * LM * (LM + 2) //! NLMMT = 2 * LM * (LM + 2) int _nlmmt; int _nlmmt; Loading Loading @@ -69,7 +69,7 @@ public: /*! \brief Swap1 instance constructor. /*! \brief Swap1 instance constructor. * * * \param lm: `int` Maximum field expansion order. * \param lm: `int` Maximum field expansion order. * \param _nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ */ Swap1(int lm, int nkv); Swap1(int lm, int nkv); Loading Loading @@ -130,10 +130,8 @@ protected: int _last_vector; int _last_vector; //! Index of the last matrix element to be filled. //! Index of the last matrix element to be filled. int _last_matrix; int _last_matrix; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave numbers. int _nkv; int _nkv; //! Contiguous vector of VKZM matrix. double *vec_vkzm; //! QUESTION: definition? //! QUESTION: definition? double _apfafa; double _apfafa; //! QUESTION: definition? //! QUESTION: definition? Loading @@ -152,13 +150,13 @@ protected: double _delxyz; double _delxyz; //! QUESTION: definition? //! QUESTION: definition? double _vknmx; double _vknmx; //! QUESTION: definition? //! Wave number grid spacing. double _delk; double _delk; //! QUESTION: definition? //! Square of wave number grid spacing. double _delks; double _delks; //! NLMMT = LM * (LM + 2) * 2 //! NLMMT = LM * (LM + 2) * 2 int _nlmmt; int _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Number of radial vector coordinates. int _nrvc; int _nrvc; /*! \brief Load a Swap2 instance from a HDF5 binary file. /*! \brief Load a Swap2 instance from a HDF5 binary file. Loading Loading @@ -192,12 +190,12 @@ public: const int &last_vector = _last_vector; const int &last_vector = _last_vector; //! Read-only view on the index of the last matrix element to be filled. //! Read-only view on the index of the last matrix element to be filled. const int &last_matrix = _last_matrix; const int &last_matrix = _last_matrix; //! Read-only view on the number of vector coordinates. QUESTION: correct? //! Read-only view on the number of beam description wave numbers. const int &nkv = _nkv; const int &nkv = _nkv; //! QUESTION: definition? //! QUESTION: definition? double *vkv; double *vkv; //! QUESTION: definition? //! QUESTION: definition? double **vkzm; double *vec_vkzm; //! QUESTION: definition? //! QUESTION: definition? const double &apfafa = _apfafa; const double &apfafa = _apfafa; //! QUESTION: definition? //! QUESTION: definition? Loading @@ -222,12 +220,12 @@ public: const double &delks = _delks; const double &delks = _delks; //! NLMMT = LM * (LM + 2) * 2 //! NLMMT = LM * (LM + 2) * 2 const int &nlmmt = _nlmmt; const int &nlmmt = _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Read-only view on the number of radial vector coordinates. const int &nrvc = _nrvc; const int &nrvc = _nrvc; /*! \brief Swap2 instance constructor. /*! \brief Swap2 instance constructor. * * * \param nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ */ Swap2(int nkv); Swap2(int nkv); Loading @@ -244,15 +242,9 @@ public: */ */ static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY"); static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY"); /*! \brief Get the pointer to the VKZM matrix. * * \return value: `double **` Pointer to the VKZM matrix. */ double **get_matrix() { return vkzm; } /*! \brief Calculate the necessary amount of memory to create a new instance. /*! \brief Calculate the necessary amount of memory to create a new instance. * * * \param nkv: `int` Number of radial vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. * \return size: `long` The necessary memory size in bytes. * \return size: `long` The necessary memory size in bytes. */ */ static long get_size(int nkv); static long get_size(int nkv); Loading Loading @@ -316,11 +308,11 @@ protected: int _nlmmt; int _nlmmt; //! NRVC = NXV * NYV * NZV //! NRVC = NXV * NYV * NZV int _nrvc; int _nrvc; //! Field expansion mode identifier. //! Beam description mode. int _lmode; int _lmode; //! Maximum field expansion order. //! Maximum field expansion order. int _lm; int _lm; //! QUESTION: definition? //! Number of beam description wave numbers. int _nkv; int _nkv; //! Number of computed X coordinates. //! Number of computed X coordinates. int _nxv; int _nxv; Loading @@ -332,11 +324,11 @@ protected: double _vk; double _vk; //! External medium refractive index //! External medium refractive index double _exri; double _exri; //! QUESTION: definition? //! Numerical aperture. double _an; double _an; //! QUESTION: definition? //! Filling factor. double _ff; double _ff; //! QUESTION: definition? //! Lens transmission. double _tra; double _tra; //! QUESTION: definition? //! QUESTION: definition? double _spd; double _spd; Loading @@ -350,8 +342,6 @@ protected: double *yv; double *yv; //! Vector of computed z positions //! Vector of computed z positions double *zv; double *zv; //! QUESTION: definition? dcomplex *vec_wsum; /*! \brief Load a configuration instance from a HDF5 binary file. /*! \brief Load a configuration instance from a HDF5 binary file. * * Loading Loading @@ -402,11 +392,11 @@ public: const double& vk = _vk; const double& vk = _vk; //! Read-only view on external medium refractive index //! Read-only view on external medium refractive index const double& exri = _exri; const double& exri = _exri; //! QUESTION: definition? //! Read-only view on numeric aperture. const double& an = _an; const double& an = _an; //! QUESTION: definition? //! Read-only view on filling factor. const double& ff = _ff; const double& ff = _ff; //! QUESTION: definition? //! Read-only view on lens transmission. const double& tra = _tra; const double& tra = _tra; //! QUESTION: definition? //! QUESTION: definition? const double& spd = _spd; const double& spd = _spd; Loading @@ -415,7 +405,7 @@ public: //! QUESTION: definition? //! QUESTION: definition? const double& exril = _exril; const double& exril = _exril; //! QUESTION: definition? //! QUESTION: definition? dcomplex **wsum; dcomplex *vec_wsum; /*! \brief Trapping configuration instance constructor. /*! \brief Trapping configuration instance constructor. * * Loading src/libnptm/Configuration.cpp +4 −2 Original line number Original line Diff line number Diff line Loading @@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil // Read optional configuration data used only by the C++ code. // Read optional configuration data used only by the C++ code. while (num_lines > last_read_line) { while (num_lines > last_read_line) { str_target = file_lines[last_read_line++]; str_target = file_lines[last_read_line++]; if (str_target.size() > 0) { if (str_target.size() > 15) { if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) { if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) { regex_search(str_target, m, re); regex_search(str_target, m, re); short refine_flag = (short)stoi(m.str()); short refine_flag = (short)stoi(m.str()); conf->_refine_flag = refine_flag; conf->_refine_flag = refine_flag; } } else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { } if (str_target.size() > 14) { if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { regex_search(str_target, m, re); regex_search(str_target, m, re); short dyn_order_flag = (short)stoi(m.str()); short dyn_order_flag = (short)stoi(m.str()); conf->_dyn_order_flag = dyn_order_flag; conf->_dyn_order_flag = dyn_order_flag; Loading src/libnptm/clu_subs.cpp +115 −102 Original line number Original line Diff line number Diff line Loading @@ -47,10 +47,6 @@ #include <omp.h> #include <omp.h> #endif #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; using namespace std; void apc( void apc( Loading Loading @@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif double cgev(int ipamo, int mu, int l, int m) { double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double result = 0.0; double xd = 0.0, xn = 0.0; double xd = 0.0, xn = 0.0; Loading Loading @@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) { } } return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif void cms(dcomplex **am, ParticleDescriptor *c1) { void cms(dcomplex **am, ParticleDescriptor *c1) { dcomplex dm, de, cgh, cgk; dcomplex dm, de, cgh, cgk; Loading Loading @@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; delete[] svs; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif dcomplex ghit_d( dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j ParticleDescriptor *c1, double *rac3j Loading Loading @@ -862,13 +858,13 @@ dcomplex ghit_d( } } return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif dcomplex ghit( dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 ParticleDescriptor *c1 Loading Loading @@ -1079,9 +1075,9 @@ dcomplex ghit( } } return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif void hjv( void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); nvtxRangePush("pcros intermediate loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else // #else // #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #endif #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif for (int i12 = 0; i12 < nlemt; i12++) { for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; // int i = i12 - 1; dcomplex am = cc0; dcomplex am = cc0; Loading Loading @@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { csam = -(ccs / (exri * vk)) * 0.5 * I; csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum2 = cc0; sum3 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) // #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else // #else // #pragma omp parallel for simd reduction(+:sum2,sum3) // #endif #pragma omp parallel for simd reduction(+:sum2,sum3) #pragma omp parallel for simd reduction(+:sum2,sum3) #endif for (int i14 = 0; i14 < c1->nlem; i14++) { for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; int ie = i14 + c1->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); Loading @@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { } // i14 loop } // i14 loop double sumpi = 0.0; double sumpi = 0.0; dcomplex sumpd = cc0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else // #else // #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #endif #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { int je = j16 + c1->nlem; int je = j16 + c1->nlem; Loading Loading @@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) { } } } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jmx = j3 + j2; int jdf = j3 - j2; int jdf = j3 - j2; Loading Loading @@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { } } } } } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jmx = j3 + j2; int jdf = j3 - j2; int jdf = j3 - j2; Loading Loading @@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } } } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; Loading Loading @@ -2005,11 +2004,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); nvtxRangePush("raba inner loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) // #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else // #else // #pragma omp parallel for simd reduction(+:c1, c2) // #endif #pragma omp parallel for simd reduction(+:c1, c2) #pragma omp parallel for simd reduction(+:c1, c2) #endif for (int j10 = 1; j10 <= nlemt; j10++) { for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); Loading @@ -2027,11 +2027,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); nvtxRangePush("raba outer loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for // #pragma omp teams distribute parallel for #else // #else // #pragma omp parallel for // #endif #pragma omp parallel for #pragma omp parallel for #endif for (int ipo = 0; ipo < 2; ipo++) { for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; int jpo = 1 - ipo; ctqce[ipo][0] = cc0; ctqce[ipo][0] = cc0; Loading Loading @@ -2063,11 +2064,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); nvtxRangePush("raba inner loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else // #else // #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #endif #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; int im60 = k - (l60*l60) + 1; Loading Loading @@ -2140,11 +2142,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba loop 3"); nvtxRangePush("raba loop 3"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd // #pragma omp target teams distribute parallel for simd #else // #else // #pragma omp parallel for simd // #endif #pragma omp parallel for simd #pragma omp parallel for simd #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; Loading Loading @@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); nvtxRangePush("scr0 inner loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) // #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else // #else // #pragma omp parallel for simd reduction(+:sums, sum21) // #endif #pragma omp parallel for simd reduction(+:sums, sum21) #pragma omp parallel for simd reduction(+:sums, sum21) #endif for (int l10 = 1; l10 <= c1->li; l10++) { for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; Loading Loading @@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr0 loop 2"); nvtxRangePush("scr0 loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) // #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else // #else // #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) // #endif #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; scs += c1->sscs[iogi - 1]; Loading Loading @@ -2328,11 +2333,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); nvtxRangePush("scr2 inner loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; int im10 = k - (l10*l10) + 1; Loading Loading @@ -2384,11 +2390,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); nvtxRangePush("scr2 loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int i = i14 - 1; int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2418,11 +2425,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); nvtxRangePush("scr2 inner loop 3"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) // #pragma omp target teams distribute parallel for simd collapse(4) #else // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) #pragma omp parallel for simd collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading @@ -2445,11 +2453,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); nvtxRangePush("scr2 loop 4"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for collapse(4) // #pragma omp target teams distribute parallel for collapse(4) #else // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) #pragma omp parallel for collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading Loading @@ -2582,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) // #pragma omp target teams distribute parallel for simd collapse(3) #else // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) #pragma omp parallel for simd collapse(3) #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2632,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif #endif dcomplex *am_v = am[0]; dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) // #pragma omp target teams distribute parallel for simd collapse(2) #else // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; dcomplex sum1 = cc0; Loading Loading @@ -2669,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i3 loop } // i1 loop } // i1 loop #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) // #pragma omp target teams distribute parallel for simd collapse(2) #else // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2682,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i0 loop } // i0 loop } // i1 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) // #pragma omp target teams distribute parallel for simd collapse(2) #else // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #pragma omp parallel for simd collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; int i0e = i0 + c1->nlem; Loading src/libnptm/tfrfme.cpp +54 −23 Original line number Original line Diff line number Diff line Loading @@ -44,6 +44,10 @@ #include "../include/file_io.h" #include "../include/file_io.h" #endif #endif #ifdef USE_TARGET_OFFLOAD #include <cstdlib> #endif using namespace std; using namespace std; // >>> START OF Swap1 CLASS IMPLEMENTATION <<< // >>> START OF Swap1 CLASS IMPLEMENTATION <<< Loading Loading @@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) { // >>> START OF Swap2 CLASS IMPLEMENTATION <<< // >>> START OF Swap2 CLASS IMPLEMENTATION <<< Swap2::Swap2(int nkv) { Swap2::Swap2(int nkv) { _nkv = nkv; _nkv = nkv; #ifdef USE_TARGET_OFFLOAD vkv = (double *)aligned_alloc(64, _nkv * sizeof(double)); vec_vkzm = (double *)aligned_alloc(64, _nkv * _nkv * sizeof(double)); #pragma omp parallel for collapse(2) for (int i = 0; i < _nkv; i++) { for (int j = 0; j < _nkv; j++) { vkv[i] = 0.0; vec_vkzm[_nkv * i +j] = 0.0; } } #else vkv = new double[_nkv](); vkv = new double[_nkv](); vec_vkzm = new double[_nkv * _nkv](); vec_vkzm = new double[_nkv * _nkv](); vkzm = new double*[nkv]; #endif // USE TARGET_OFFLOAD for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv; _last_vector = 0; _last_vector = 0; _last_matrix = 0; _last_matrix = 0; } } Swap2::~Swap2() { Swap2::~Swap2() { #ifdef USE_TARGET_OFFLOAD free(vkv); free(vec_vkzm); #else delete[] vkv; delete[] vkv; delete[] vec_vkzm; delete[] vec_vkzm; delete[] vkzm; #endif // USE_TARGET_OFFLOAD } } Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) { Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { fstream input; fstream input; Swap2 *instance = NULL; Swap2 *instance = NULL; int fnkv, fnlmmt, fnrvc; int fnkv, fnlmmt, fnrvc; double **fvkzm = NULL; double *fvkzm = NULL; double *fvkv = NULL; double *fvkv = NULL; double value; double value; input.open(file_name.c_str(), ios::in | ios::binary); input.open(file_name.c_str(), ios::in | ios::binary); if (input.is_open()) { if (input.is_open()) { input.read(reinterpret_cast<char *>(&fnkv), sizeof(int)); input.read(reinterpret_cast<char *>(&fnkv), sizeof(int)); instance = new Swap2(fnkv); instance = new Swap2(fnkv); fvkzm = instance->get_matrix(); fvkzm = instance->vec_vkzm; fvkv = instance->get_vector(); fvkv = instance->get_vector(); for (int vj = 0; vj < fnkv; vj++) { for (int vj = 0; vj < fnkv; vj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading @@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { for (int mi = 0; mi < fnkv; mi++) { for (int mi = 0; mi < fnkv; mi++) { for (int mj = 0; mj < fnkv; mj++) { for (int mj = 0; mj < fnkv; mj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); input.read(reinterpret_cast<char *>(&value), sizeof(double)); fvkzm[mi][mj] = value; fvkzm[fnkv * mi + mj] = value; } } } } input.read(reinterpret_cast<char *>(&value), sizeof(double)); input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading Loading @@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) { void Swap2::push_matrix(double value) { void Swap2::push_matrix(double value) { int col = _last_matrix % (_nkv - 1); int col = _last_matrix % (_nkv - 1); int row = _last_matrix - (_nkv * row); int row = _last_matrix - (_nkv * row); vkzm[row][col] = value; vec_vkzm[nkv * row + col] = value; _last_matrix++; _last_matrix++; } } Loading Loading @@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) { } } for (int mi = 0; mi < _nkv; mi++) { for (int mi = 0; mi < _nkv; mi++) { for (int mj = 0; mj < _nkv; mj++) { for (int mj = 0; mj < _nkv; mj++) { value = vkzm[mi][mj]; value = vec_vkzm[nkv * mi + mj]; output.write(reinterpret_cast<const char*>(&value), sizeof(double)); output.write(reinterpret_cast<const char*>(&value), sizeof(double)); } } } } Loading Loading @@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) { } } } } for (int mi = 0; mi < _nkv; mi++) { for (int mi = 0; mi < _nkv; mi++) { int nkvi = nkv * mi; for (int mj = 0; mj < _nkv; mj++) { for (int mj = 0; mj < _nkv; mj++) { if (vkzm[mi][mj] != other.vkzm[mi][mj]) { if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) { return false; return false; } } } } Loading @@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) { _exril = 0.0; _exril = 0.0; // Array initialization // Array initialization xv = new double[nxv](); yv = new double[nyv](); zv = new double[nzv](); _nlmmt = _lm * (_lm + 2) * 2; _nlmmt = _lm * (_lm + 2) * 2; _nrvc = _nxv * _nyv * _nzv; _nrvc = _nxv * _nyv * _nzv; vec_wsum = new dcomplex[nrvc * nlmmt](); #ifdef USE_TARGET_OFFLOAD wsum = new dcomplex*[nlmmt]; xv = (double *)aligned_alloc(64, sizeof(double) * _nxv); for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc; yv = (double *)aligned_alloc(64, sizeof(double) * _nyv); zv = (double *)aligned_alloc(64, sizeof(double) * _nzv); vec_wsum = (dcomplex *)aligned_alloc(64, sizeof(dcomplex) * _nrvc * _nlmmt); #else xv = new double[_nxv](); yv = new double[_nyv](); zv = new double[_nzv](); vec_wsum = new dcomplex[_nrvc * _nlmmt](); #endif // USE_TARGET_OFFLOAD } } TFRFME::~TFRFME() { TFRFME::~TFRFME() { #ifdef USE_TARGET_OFFLOAD free(xv); free(yv); free(zv); free(vec_wsum); #else delete[] xv; delete[] xv; delete[] yv; delete[] yv; delete[] zv; delete[] zv; delete[] vec_wsum; delete[] vec_wsum; delete[] wsum; #endif } } TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) { TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) { for (int wj = 0; wj < nrvc; wj++) { for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { for (int wi = 0; wi < nlmmt; wi++) { value = elements[index] + elements[index + 1] * I; value = elements[index] + elements[index + 1] * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; index += 2; index += 2; } // wi loop } // wi loop } // wj loop } // wj loop Loading Loading @@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) { input.read(reinterpret_cast<char *>(&rval), sizeof(double)); input.read(reinterpret_cast<char *>(&rval), sizeof(double)); input.read(reinterpret_cast<char *>(&ival), sizeof(double)); input.read(reinterpret_cast<char *>(&ival), sizeof(double)); dcomplex value = rval + ival * I; dcomplex value = rval + ival * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; } // wi loop } // wi loop } // wj loop } // wj loop input.close(); input.close(); Loading Loading @@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) { int index = 0; int index = 0; for (int wj = 0; wj < nrvc; wj++) { for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { for (int wi = 0; wi < nlmmt; wi++) { ptr_elements[index++] = real(wsum[wi][wj]); ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]); ptr_elements[index++] = imag(wsum[wi][wj]); ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]); } // wi loop } // wi loop } // wj loop } // wj loop rec_ptr_list.append(ptr_elements); rec_ptr_list.append(ptr_elements); Loading Loading @@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) { output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double)); output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double)); for (int wj = 0; wj < _nrvc; wj++) { for (int wj = 0; wj < _nrvc; wj++) { for (int wi = 0; wi < _nlmmt; wi++) { for (int wi = 0; wi < _nlmmt; wi++) { double rval = real(wsum[wi][wj]); double rval = real(vec_wsum[nrvc * wi + wj]); double ival = imag(wsum[wi][wj]); double ival = imag(vec_wsum[nrvc * wi + wj]); output.write(reinterpret_cast<char *>(&rval), sizeof(double)); output.write(reinterpret_cast<char *>(&rval), sizeof(double)); output.write(reinterpret_cast<char *>(&ival), sizeof(double)); output.write(reinterpret_cast<char *>(&ival), sizeof(double)); } // wi loop } // wi loop Loading Loading @@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) { } } } } for (int wi = 0; wi < _nlmmt; wi++) { for (int wi = 0; wi < _nlmmt; wi++) { int i = _nrvc * wi; for (int wj = 0; wj < _nrvc; wj++) { for (int wj = 0; wj < _nrvc; wj++) { if (wsum[wi][wj] != other.wsum[wi][wj]) { if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) { return false; return false; } } } // wj loop } // wj loop Loading Loading
build/configure.sh +6 −6 Original line number Original line Diff line number Diff line Loading @@ -364,8 +364,8 @@ result=$? if [ "x$result" = "x0" ]; then if [ "x$result" = "x0" ]; then CLANGFLAGS=" -stdlib=libstdc++" CLANGFLAGS=" -stdlib=libstdc++" fi fi echo -n "configure: checking wether $CXX works... " echo -n "configure: checking whether $CXX works... " echo -n "configure: checking wether $CXX works... " >>configure.log echo -n "configure: checking whether $CXX works... " >>configure.log cat > test_compiler.cpp <<EOF cat > test_compiler.cpp <<EOF int main() { int main() { int i = -1; int i = -1; Loading @@ -385,8 +385,8 @@ else echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log exit 2 exit 2 fi fi echo -n "configure: checking wether $CXX supports -ggdb... " echo -n "configure: checking whether $CXX supports -ggdb... " echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log $CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log $CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log result=$? result=$? if [ "x$result" = "x0" ]; then if [ "x$result" = "x0" ]; then Loading @@ -410,8 +410,8 @@ else echo "no" echo "no" echo "no" >>configure.log echo "no" >>configure.log fi fi echo -n "configure: checking wether $CXX is a MPI compiler... " echo -n "configure: checking whether $CXX is a MPI compiler... " echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log cat > test_compiler.cpp <<EOF cat > test_compiler.cpp <<EOF # include <mpi.h> # include <mpi.h> int main() { int main() { Loading
src/include/tfrfme.h +20 −30 Original line number Original line Diff line number Diff line Loading @@ -28,7 +28,7 @@ class Swap1 { protected: protected: //! Index of the last element to be filled. //! Index of the last element to be filled. int _last_index; int _last_index; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave-numbers. int _nkv; int _nkv; //! NLMMT = 2 * LM * (LM + 2) //! NLMMT = 2 * LM * (LM + 2) int _nlmmt; int _nlmmt; Loading Loading @@ -69,7 +69,7 @@ public: /*! \brief Swap1 instance constructor. /*! \brief Swap1 instance constructor. * * * \param lm: `int` Maximum field expansion order. * \param lm: `int` Maximum field expansion order. * \param _nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ */ Swap1(int lm, int nkv); Swap1(int lm, int nkv); Loading Loading @@ -130,10 +130,8 @@ protected: int _last_vector; int _last_vector; //! Index of the last matrix element to be filled. //! Index of the last matrix element to be filled. int _last_matrix; int _last_matrix; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave numbers. int _nkv; int _nkv; //! Contiguous vector of VKZM matrix. double *vec_vkzm; //! QUESTION: definition? //! QUESTION: definition? double _apfafa; double _apfafa; //! QUESTION: definition? //! QUESTION: definition? Loading @@ -152,13 +150,13 @@ protected: double _delxyz; double _delxyz; //! QUESTION: definition? //! QUESTION: definition? double _vknmx; double _vknmx; //! QUESTION: definition? //! Wave number grid spacing. double _delk; double _delk; //! QUESTION: definition? //! Square of wave number grid spacing. double _delks; double _delks; //! NLMMT = LM * (LM + 2) * 2 //! NLMMT = LM * (LM + 2) * 2 int _nlmmt; int _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Number of radial vector coordinates. int _nrvc; int _nrvc; /*! \brief Load a Swap2 instance from a HDF5 binary file. /*! \brief Load a Swap2 instance from a HDF5 binary file. Loading Loading @@ -192,12 +190,12 @@ public: const int &last_vector = _last_vector; const int &last_vector = _last_vector; //! Read-only view on the index of the last matrix element to be filled. //! Read-only view on the index of the last matrix element to be filled. const int &last_matrix = _last_matrix; const int &last_matrix = _last_matrix; //! Read-only view on the number of vector coordinates. QUESTION: correct? //! Read-only view on the number of beam description wave numbers. const int &nkv = _nkv; const int &nkv = _nkv; //! QUESTION: definition? //! QUESTION: definition? double *vkv; double *vkv; //! QUESTION: definition? //! QUESTION: definition? double **vkzm; double *vec_vkzm; //! QUESTION: definition? //! QUESTION: definition? const double &apfafa = _apfafa; const double &apfafa = _apfafa; //! QUESTION: definition? //! QUESTION: definition? Loading @@ -222,12 +220,12 @@ public: const double &delks = _delks; const double &delks = _delks; //! NLMMT = LM * (LM + 2) * 2 //! NLMMT = LM * (LM + 2) * 2 const int &nlmmt = _nlmmt; const int &nlmmt = _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Read-only view on the number of radial vector coordinates. const int &nrvc = _nrvc; const int &nrvc = _nrvc; /*! \brief Swap2 instance constructor. /*! \brief Swap2 instance constructor. * * * \param nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ */ Swap2(int nkv); Swap2(int nkv); Loading @@ -244,15 +242,9 @@ public: */ */ static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY"); static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY"); /*! \brief Get the pointer to the VKZM matrix. * * \return value: `double **` Pointer to the VKZM matrix. */ double **get_matrix() { return vkzm; } /*! \brief Calculate the necessary amount of memory to create a new instance. /*! \brief Calculate the necessary amount of memory to create a new instance. * * * \param nkv: `int` Number of radial vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. * \return size: `long` The necessary memory size in bytes. * \return size: `long` The necessary memory size in bytes. */ */ static long get_size(int nkv); static long get_size(int nkv); Loading Loading @@ -316,11 +308,11 @@ protected: int _nlmmt; int _nlmmt; //! NRVC = NXV * NYV * NZV //! NRVC = NXV * NYV * NZV int _nrvc; int _nrvc; //! Field expansion mode identifier. //! Beam description mode. int _lmode; int _lmode; //! Maximum field expansion order. //! Maximum field expansion order. int _lm; int _lm; //! QUESTION: definition? //! Number of beam description wave numbers. int _nkv; int _nkv; //! Number of computed X coordinates. //! Number of computed X coordinates. int _nxv; int _nxv; Loading @@ -332,11 +324,11 @@ protected: double _vk; double _vk; //! External medium refractive index //! External medium refractive index double _exri; double _exri; //! QUESTION: definition? //! Numerical aperture. double _an; double _an; //! QUESTION: definition? //! Filling factor. double _ff; double _ff; //! QUESTION: definition? //! Lens transmission. double _tra; double _tra; //! QUESTION: definition? //! QUESTION: definition? double _spd; double _spd; Loading @@ -350,8 +342,6 @@ protected: double *yv; double *yv; //! Vector of computed z positions //! Vector of computed z positions double *zv; double *zv; //! QUESTION: definition? dcomplex *vec_wsum; /*! \brief Load a configuration instance from a HDF5 binary file. /*! \brief Load a configuration instance from a HDF5 binary file. * * Loading Loading @@ -402,11 +392,11 @@ public: const double& vk = _vk; const double& vk = _vk; //! Read-only view on external medium refractive index //! Read-only view on external medium refractive index const double& exri = _exri; const double& exri = _exri; //! QUESTION: definition? //! Read-only view on numeric aperture. const double& an = _an; const double& an = _an; //! QUESTION: definition? //! Read-only view on filling factor. const double& ff = _ff; const double& ff = _ff; //! QUESTION: definition? //! Read-only view on lens transmission. const double& tra = _tra; const double& tra = _tra; //! QUESTION: definition? //! QUESTION: definition? const double& spd = _spd; const double& spd = _spd; Loading @@ -415,7 +405,7 @@ public: //! QUESTION: definition? //! QUESTION: definition? const double& exril = _exril; const double& exril = _exril; //! QUESTION: definition? //! QUESTION: definition? dcomplex **wsum; dcomplex *vec_wsum; /*! \brief Trapping configuration instance constructor. /*! \brief Trapping configuration instance constructor. * * Loading
src/libnptm/Configuration.cpp +4 −2 Original line number Original line Diff line number Diff line Loading @@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil // Read optional configuration data used only by the C++ code. // Read optional configuration data used only by the C++ code. while (num_lines > last_read_line) { while (num_lines > last_read_line) { str_target = file_lines[last_read_line++]; str_target = file_lines[last_read_line++]; if (str_target.size() > 0) { if (str_target.size() > 15) { if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) { if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) { regex_search(str_target, m, re); regex_search(str_target, m, re); short refine_flag = (short)stoi(m.str()); short refine_flag = (short)stoi(m.str()); conf->_refine_flag = refine_flag; conf->_refine_flag = refine_flag; } } else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { } if (str_target.size() > 14) { if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { regex_search(str_target, m, re); regex_search(str_target, m, re); short dyn_order_flag = (short)stoi(m.str()); short dyn_order_flag = (short)stoi(m.str()); conf->_dyn_order_flag = dyn_order_flag; conf->_dyn_order_flag = dyn_order_flag; Loading
src/libnptm/clu_subs.cpp +115 −102 Original line number Original line Diff line number Diff line Loading @@ -47,10 +47,6 @@ #include <omp.h> #include <omp.h> #endif #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; using namespace std; void apc( void apc( Loading Loading @@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif double cgev(int ipamo, int mu, int l, int m) { double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double result = 0.0; double xd = 0.0, xn = 0.0; double xd = 0.0, xn = 0.0; Loading Loading @@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) { } } return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif void cms(dcomplex **am, ParticleDescriptor *c1) { void cms(dcomplex **am, ParticleDescriptor *c1) { dcomplex dm, de, cgh, cgk; dcomplex dm, de, cgh, cgk; Loading Loading @@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; delete[] svs; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif dcomplex ghit_d( dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j ParticleDescriptor *c1, double *rac3j Loading Loading @@ -862,13 +858,13 @@ dcomplex ghit_d( } } return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif dcomplex ghit( dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 ParticleDescriptor *c1 Loading Loading @@ -1079,9 +1075,9 @@ dcomplex ghit( } } return result; return result; } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif void hjv( void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); nvtxRangePush("pcros intermediate loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else // #else // #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #endif #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif for (int i12 = 0; i12 < nlemt; i12++) { for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; // int i = i12 - 1; dcomplex am = cc0; dcomplex am = cc0; Loading Loading @@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { csam = -(ccs / (exri * vk)) * 0.5 * I; csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum2 = cc0; sum3 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) // #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else // #else // #pragma omp parallel for simd reduction(+:sum2,sum3) // #endif #pragma omp parallel for simd reduction(+:sum2,sum3) #pragma omp parallel for simd reduction(+:sum2,sum3) #endif for (int i14 = 0; i14 < c1->nlem; i14++) { for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; int ie = i14 + c1->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); Loading @@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { } // i14 loop } // i14 loop double sumpi = 0.0; double sumpi = 0.0; dcomplex sumpd = cc0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else // #else // #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #endif #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { int je = j16 + c1->nlem; int je = j16 + c1->nlem; Loading Loading @@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) { } } } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jmx = j3 + j2; int jdf = j3 - j2; int jdf = j3 - j2; Loading Loading @@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { } } } } } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) // #pragma omp begin declare target device_type(any) #endif // #endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jmx = j3 + j2; int jdf = j3 - j2; int jdf = j3 - j2; Loading Loading @@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } } } } #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target // #pragma omp end declare target #endif // #endif void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; Loading Loading @@ -2005,11 +2004,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); nvtxRangePush("raba inner loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) // #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else // #else // #pragma omp parallel for simd reduction(+:c1, c2) // #endif #pragma omp parallel for simd reduction(+:c1, c2) #pragma omp parallel for simd reduction(+:c1, c2) #endif for (int j10 = 1; j10 <= nlemt; j10++) { for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); Loading @@ -2027,11 +2027,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); nvtxRangePush("raba outer loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for // #pragma omp teams distribute parallel for #else // #else // #pragma omp parallel for // #endif #pragma omp parallel for #pragma omp parallel for #endif for (int ipo = 0; ipo < 2; ipo++) { for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; int jpo = 1 - ipo; ctqce[ipo][0] = cc0; ctqce[ipo][0] = cc0; Loading Loading @@ -2063,11 +2064,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); nvtxRangePush("raba inner loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else // #else // #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #endif #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; int im60 = k - (l60*l60) + 1; Loading Loading @@ -2140,11 +2142,12 @@ void raba( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("raba loop 3"); nvtxRangePush("raba loop 3"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd // #pragma omp target teams distribute parallel for simd #else // #else // #pragma omp parallel for simd // #endif #pragma omp parallel for simd #pragma omp parallel for simd #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; Loading Loading @@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); nvtxRangePush("scr0 inner loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) // #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else // #else // #pragma omp parallel for simd reduction(+:sums, sum21) // #endif #pragma omp parallel for simd reduction(+:sums, sum21) #pragma omp parallel for simd reduction(+:sums, sum21) #endif for (int l10 = 1; l10 <= c1->li; l10++) { for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; Loading Loading @@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr0 loop 2"); nvtxRangePush("scr0 loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) // #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else // #else // #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) // #endif #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; scs += c1->sscs[iogi - 1]; Loading Loading @@ -2328,11 +2333,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); nvtxRangePush("scr2 inner loop 1"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif for (int k = 1; k<=kmax; k++) { for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; int im10 = k - (l10*l10) + 1; Loading Loading @@ -2384,11 +2390,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); nvtxRangePush("scr2 loop 2"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int i = i14 - 1; int iogi = c1->iog[i14 - 1]; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2418,11 +2425,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); nvtxRangePush("scr2 inner loop 3"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) // #pragma omp target teams distribute parallel for simd collapse(4) #else // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) #pragma omp parallel for simd collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading @@ -2445,11 +2453,12 @@ void scr2( #ifdef USE_NVTX #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); nvtxRangePush("scr2 loop 4"); #endif #endif #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for collapse(4) // #pragma omp target teams distribute parallel for collapse(4) #else // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) #pragma omp parallel for collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading Loading @@ -2582,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) // #pragma omp target teams distribute parallel for simd collapse(3) #else // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) #pragma omp parallel for simd collapse(3) #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2632,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif #endif dcomplex *am_v = am[0]; dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) // #pragma omp target teams distribute parallel for simd collapse(2) #else // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; dcomplex sum1 = cc0; Loading Loading @@ -2669,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i3 loop } // i1 loop } // i1 loop #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) // #pragma omp target teams distribute parallel for simd collapse(2) #else // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2682,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i0 loop } // i0 loop } // i1 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD // #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) // #pragma omp target teams distribute parallel for simd collapse(2) #else // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #pragma omp parallel for simd collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; int i0e = i0 + c1->nlem; Loading
src/libnptm/tfrfme.cpp +54 −23 Original line number Original line Diff line number Diff line Loading @@ -44,6 +44,10 @@ #include "../include/file_io.h" #include "../include/file_io.h" #endif #endif #ifdef USE_TARGET_OFFLOAD #include <cstdlib> #endif using namespace std; using namespace std; // >>> START OF Swap1 CLASS IMPLEMENTATION <<< // >>> START OF Swap1 CLASS IMPLEMENTATION <<< Loading Loading @@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) { // >>> START OF Swap2 CLASS IMPLEMENTATION <<< // >>> START OF Swap2 CLASS IMPLEMENTATION <<< Swap2::Swap2(int nkv) { Swap2::Swap2(int nkv) { _nkv = nkv; _nkv = nkv; #ifdef USE_TARGET_OFFLOAD vkv = (double *)aligned_alloc(64, _nkv * sizeof(double)); vec_vkzm = (double *)aligned_alloc(64, _nkv * _nkv * sizeof(double)); #pragma omp parallel for collapse(2) for (int i = 0; i < _nkv; i++) { for (int j = 0; j < _nkv; j++) { vkv[i] = 0.0; vec_vkzm[_nkv * i +j] = 0.0; } } #else vkv = new double[_nkv](); vkv = new double[_nkv](); vec_vkzm = new double[_nkv * _nkv](); vec_vkzm = new double[_nkv * _nkv](); vkzm = new double*[nkv]; #endif // USE TARGET_OFFLOAD for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv; _last_vector = 0; _last_vector = 0; _last_matrix = 0; _last_matrix = 0; } } Swap2::~Swap2() { Swap2::~Swap2() { #ifdef USE_TARGET_OFFLOAD free(vkv); free(vec_vkzm); #else delete[] vkv; delete[] vkv; delete[] vec_vkzm; delete[] vec_vkzm; delete[] vkzm; #endif // USE_TARGET_OFFLOAD } } Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) { Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { fstream input; fstream input; Swap2 *instance = NULL; Swap2 *instance = NULL; int fnkv, fnlmmt, fnrvc; int fnkv, fnlmmt, fnrvc; double **fvkzm = NULL; double *fvkzm = NULL; double *fvkv = NULL; double *fvkv = NULL; double value; double value; input.open(file_name.c_str(), ios::in | ios::binary); input.open(file_name.c_str(), ios::in | ios::binary); if (input.is_open()) { if (input.is_open()) { input.read(reinterpret_cast<char *>(&fnkv), sizeof(int)); input.read(reinterpret_cast<char *>(&fnkv), sizeof(int)); instance = new Swap2(fnkv); instance = new Swap2(fnkv); fvkzm = instance->get_matrix(); fvkzm = instance->vec_vkzm; fvkv = instance->get_vector(); fvkv = instance->get_vector(); for (int vj = 0; vj < fnkv; vj++) { for (int vj = 0; vj < fnkv; vj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading @@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { for (int mi = 0; mi < fnkv; mi++) { for (int mi = 0; mi < fnkv; mi++) { for (int mj = 0; mj < fnkv; mj++) { for (int mj = 0; mj < fnkv; mj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); input.read(reinterpret_cast<char *>(&value), sizeof(double)); fvkzm[mi][mj] = value; fvkzm[fnkv * mi + mj] = value; } } } } input.read(reinterpret_cast<char *>(&value), sizeof(double)); input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading Loading @@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) { void Swap2::push_matrix(double value) { void Swap2::push_matrix(double value) { int col = _last_matrix % (_nkv - 1); int col = _last_matrix % (_nkv - 1); int row = _last_matrix - (_nkv * row); int row = _last_matrix - (_nkv * row); vkzm[row][col] = value; vec_vkzm[nkv * row + col] = value; _last_matrix++; _last_matrix++; } } Loading Loading @@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) { } } for (int mi = 0; mi < _nkv; mi++) { for (int mi = 0; mi < _nkv; mi++) { for (int mj = 0; mj < _nkv; mj++) { for (int mj = 0; mj < _nkv; mj++) { value = vkzm[mi][mj]; value = vec_vkzm[nkv * mi + mj]; output.write(reinterpret_cast<const char*>(&value), sizeof(double)); output.write(reinterpret_cast<const char*>(&value), sizeof(double)); } } } } Loading Loading @@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) { } } } } for (int mi = 0; mi < _nkv; mi++) { for (int mi = 0; mi < _nkv; mi++) { int nkvi = nkv * mi; for (int mj = 0; mj < _nkv; mj++) { for (int mj = 0; mj < _nkv; mj++) { if (vkzm[mi][mj] != other.vkzm[mi][mj]) { if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) { return false; return false; } } } } Loading @@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) { _exril = 0.0; _exril = 0.0; // Array initialization // Array initialization xv = new double[nxv](); yv = new double[nyv](); zv = new double[nzv](); _nlmmt = _lm * (_lm + 2) * 2; _nlmmt = _lm * (_lm + 2) * 2; _nrvc = _nxv * _nyv * _nzv; _nrvc = _nxv * _nyv * _nzv; vec_wsum = new dcomplex[nrvc * nlmmt](); #ifdef USE_TARGET_OFFLOAD wsum = new dcomplex*[nlmmt]; xv = (double *)aligned_alloc(64, sizeof(double) * _nxv); for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc; yv = (double *)aligned_alloc(64, sizeof(double) * _nyv); zv = (double *)aligned_alloc(64, sizeof(double) * _nzv); vec_wsum = (dcomplex *)aligned_alloc(64, sizeof(dcomplex) * _nrvc * _nlmmt); #else xv = new double[_nxv](); yv = new double[_nyv](); zv = new double[_nzv](); vec_wsum = new dcomplex[_nrvc * _nlmmt](); #endif // USE_TARGET_OFFLOAD } } TFRFME::~TFRFME() { TFRFME::~TFRFME() { #ifdef USE_TARGET_OFFLOAD free(xv); free(yv); free(zv); free(vec_wsum); #else delete[] xv; delete[] xv; delete[] yv; delete[] yv; delete[] zv; delete[] zv; delete[] vec_wsum; delete[] vec_wsum; delete[] wsum; #endif } } TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) { TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) { for (int wj = 0; wj < nrvc; wj++) { for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { for (int wi = 0; wi < nlmmt; wi++) { value = elements[index] + elements[index + 1] * I; value = elements[index] + elements[index + 1] * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; index += 2; index += 2; } // wi loop } // wi loop } // wj loop } // wj loop Loading Loading @@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) { input.read(reinterpret_cast<char *>(&rval), sizeof(double)); input.read(reinterpret_cast<char *>(&rval), sizeof(double)); input.read(reinterpret_cast<char *>(&ival), sizeof(double)); input.read(reinterpret_cast<char *>(&ival), sizeof(double)); dcomplex value = rval + ival * I; dcomplex value = rval + ival * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; } // wi loop } // wi loop } // wj loop } // wj loop input.close(); input.close(); Loading Loading @@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) { int index = 0; int index = 0; for (int wj = 0; wj < nrvc; wj++) { for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { for (int wi = 0; wi < nlmmt; wi++) { ptr_elements[index++] = real(wsum[wi][wj]); ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]); ptr_elements[index++] = imag(wsum[wi][wj]); ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]); } // wi loop } // wi loop } // wj loop } // wj loop rec_ptr_list.append(ptr_elements); rec_ptr_list.append(ptr_elements); Loading Loading @@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) { output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double)); output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double)); for (int wj = 0; wj < _nrvc; wj++) { for (int wj = 0; wj < _nrvc; wj++) { for (int wi = 0; wi < _nlmmt; wi++) { for (int wi = 0; wi < _nlmmt; wi++) { double rval = real(wsum[wi][wj]); double rval = real(vec_wsum[nrvc * wi + wj]); double ival = imag(wsum[wi][wj]); double ival = imag(vec_wsum[nrvc * wi + wj]); output.write(reinterpret_cast<char *>(&rval), sizeof(double)); output.write(reinterpret_cast<char *>(&rval), sizeof(double)); output.write(reinterpret_cast<char *>(&ival), sizeof(double)); output.write(reinterpret_cast<char *>(&ival), sizeof(double)); } // wi loop } // wi loop Loading Loading @@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) { } } } } for (int wi = 0; wi < _nlmmt; wi++) { for (int wi = 0; wi < _nlmmt; wi++) { int i = _nrvc * wi; for (int wj = 0; wj < _nrvc; wj++) { for (int wj = 0; wj < _nrvc; wj++) { if (wsum[wi][wj] != other.wsum[wi][wj]) { if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) { return false; return false; } } } // wj loop } // wj loop Loading