Loading build/configure.sh +6 −6 Original line number Diff line number Diff line Loading @@ -364,8 +364,8 @@ result=$? if [ "x$result" = "x0" ]; then CLANGFLAGS=" -stdlib=libstdc++" fi echo -n "configure: checking wether $CXX works... " echo -n "configure: checking wether $CXX works... " >>configure.log echo -n "configure: checking whether $CXX works... " echo -n "configure: checking whether $CXX works... " >>configure.log cat > test_compiler.cpp <<EOF int main() { int i = -1; Loading @@ -385,8 +385,8 @@ else echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log exit 2 fi echo -n "configure: checking wether $CXX supports -ggdb... " echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log echo -n "configure: checking whether $CXX supports -ggdb... " echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log $CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log result=$? if [ "x$result" = "x0" ]; then Loading @@ -410,8 +410,8 @@ else echo "no" echo "no" >>configure.log fi echo -n "configure: checking wether $CXX is a MPI compiler... " echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log echo -n "configure: checking whether $CXX is a MPI compiler... " echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log cat > test_compiler.cpp <<EOF # include <mpi.h> int main() { Loading src/include/tfrfme.h +20 −30 Original line number Diff line number Diff line Loading @@ -28,7 +28,7 @@ class Swap1 { protected: //! Index of the last element to be filled. int _last_index; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave-numbers. int _nkv; //! NLMMT = 2 * LM * (LM + 2) int _nlmmt; Loading Loading @@ -69,7 +69,7 @@ public: /*! \brief Swap1 instance constructor. * * \param lm: `int` Maximum field expansion order. * \param _nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ Swap1(int lm, int nkv); Loading Loading @@ -130,10 +130,8 @@ protected: int _last_vector; //! Index of the last matrix element to be filled. int _last_matrix; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave numbers. int _nkv; //! Contiguous vector of VKZM matrix. double *vec_vkzm; //! QUESTION: definition? double _apfafa; //! QUESTION: definition? Loading @@ -152,13 +150,13 @@ protected: double _delxyz; //! QUESTION: definition? double _vknmx; //! QUESTION: definition? //! Wave number grid spacing. double _delk; //! QUESTION: definition? //! Square of wave number grid spacing. double _delks; //! NLMMT = LM * (LM + 2) * 2 int _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Number of radial vector coordinates. int _nrvc; /*! \brief Load a Swap2 instance from a HDF5 binary file. Loading Loading @@ -192,12 +190,12 @@ public: const int &last_vector = _last_vector; //! Read-only view on the index of the last matrix element to be filled. const int &last_matrix = _last_matrix; //! Read-only view on the number of vector coordinates. QUESTION: correct? //! Read-only view on the number of beam description wave numbers. const int &nkv = _nkv; //! QUESTION: definition? double *vkv; //! QUESTION: definition? double **vkzm; double *vec_vkzm; //! QUESTION: definition? const double &apfafa = _apfafa; //! QUESTION: definition? Loading @@ -222,12 +220,12 @@ public: const double &delks = _delks; //! NLMMT = LM * (LM + 2) * 2 const int &nlmmt = _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Read-only view on the number of radial vector coordinates. const int &nrvc = _nrvc; /*! \brief Swap2 instance constructor. * * \param nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ Swap2(int nkv); Loading @@ -244,15 +242,9 @@ public: */ static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY"); /*! \brief Get the pointer to the VKZM matrix. * * \return value: `double **` Pointer to the VKZM matrix. */ double **get_matrix() { return vkzm; } /*! \brief Calculate the necessary amount of memory to create a new instance. * * \param nkv: `int` Number of radial vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. * \return size: `long` The necessary memory size in bytes. */ static long get_size(int nkv); Loading Loading @@ -316,11 +308,11 @@ protected: int _nlmmt; //! NRVC = NXV * NYV * NZV int _nrvc; //! Field expansion mode identifier. //! Beam description mode. int _lmode; //! Maximum field expansion order. int _lm; //! QUESTION: definition? //! Number of beam description wave numbers. int _nkv; //! Number of computed X coordinates. int _nxv; Loading @@ -332,11 +324,11 @@ protected: double _vk; //! External medium refractive index double _exri; //! QUESTION: definition? //! Numerical aperture. double _an; //! QUESTION: definition? //! Filling factor. double _ff; //! QUESTION: definition? //! Lens transmission. double _tra; //! QUESTION: definition? double _spd; Loading @@ -350,8 +342,6 @@ protected: double *yv; //! Vector of computed z positions double *zv; //! QUESTION: definition? dcomplex *vec_wsum; /*! \brief Load a configuration instance from a HDF5 binary file. * Loading Loading @@ -402,11 +392,11 @@ public: const double& vk = _vk; //! Read-only view on external medium refractive index const double& exri = _exri; //! QUESTION: definition? //! Read-only view on numeric aperture. const double& an = _an; //! QUESTION: definition? //! Read-only view on filling factor. const double& ff = _ff; //! QUESTION: definition? //! Read-only view on lens transmission. const double& tra = _tra; //! QUESTION: definition? const double& spd = _spd; Loading @@ -415,7 +405,7 @@ public: //! QUESTION: definition? const double& exril = _exril; //! QUESTION: definition? dcomplex **wsum; dcomplex *vec_wsum; /*! \brief Trapping configuration instance constructor. * Loading src/libnptm/Configuration.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil // Read optional configuration data used only by the C++ code. while (num_lines > last_read_line) { str_target = file_lines[last_read_line++]; if (str_target.size() > 0) { if (str_target.size() > 15) { if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) { regex_search(str_target, m, re); short refine_flag = (short)stoi(m.str()); conf->_refine_flag = refine_flag; } else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { } if (str_target.size() > 14) { if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { regex_search(str_target, m, re); short dyn_order_flag = (short)stoi(m.str()); conf->_dyn_order_flag = dyn_order_flag; Loading src/libnptm/clu_subs.cpp +115 −102 Original line number Diff line number Diff line Loading @@ -47,10 +47,6 @@ #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; void apc( Loading Loading @@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double xd = 0.0, xn = 0.0; Loading Loading @@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) { } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void cms(dcomplex **am, ParticleDescriptor *c1) { dcomplex dm, de, cgh, cgk; Loading Loading @@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j Loading Loading @@ -862,13 +858,13 @@ dcomplex ghit_d( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 Loading Loading @@ -1079,9 +1075,9 @@ dcomplex ghit( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #else // #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #endif #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; dcomplex am = cc0; Loading Loading @@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) // #else // #pragma omp parallel for simd reduction(+:sum2,sum3) // #endif #pragma omp parallel for simd reduction(+:sum2,sum3) #endif for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); Loading @@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { } // i14 loop double sumpi = 0.0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #else // #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #endif #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { int je = j16 + c1->nlem; Loading Loading @@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) { } } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; Loading Loading @@ -2005,11 +2004,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) // #else // #pragma omp parallel for simd reduction(+:c1, c2) // #endif #pragma omp parallel for simd reduction(+:c1, c2) #endif for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); Loading @@ -2027,11 +2027,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp teams distribute parallel for // #else // #pragma omp parallel for // #endif #pragma omp parallel for #endif for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; ctqce[ipo][0] = cc0; Loading Loading @@ -2063,11 +2064,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #else // #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #endif #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; Loading Loading @@ -2140,11 +2142,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #else // #pragma omp parallel for simd // #endif #pragma omp parallel for simd #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; Loading Loading @@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) // #else // #pragma omp parallel for simd reduction(+:sums, sum21) // #endif #pragma omp parallel for simd reduction(+:sums, sum21) #endif for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; Loading Loading @@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) // #else // #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) // #endif #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; Loading Loading @@ -2328,11 +2333,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; Loading Loading @@ -2384,11 +2390,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2418,11 +2425,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(4) // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading @@ -2445,11 +2453,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for collapse(4) // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading Loading @@ -2582,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(3) // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2632,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; Loading Loading @@ -2669,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2682,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i0 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; Loading src/libnptm/tfrfme.cpp +54 −23 Original line number Diff line number Diff line Loading @@ -44,6 +44,10 @@ #include "../include/file_io.h" #endif #ifdef USE_TARGET_OFFLOAD #include <cstdlib> #endif using namespace std; // >>> START OF Swap1 CLASS IMPLEMENTATION <<< Loading Loading @@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) { // >>> START OF Swap2 CLASS IMPLEMENTATION <<< Swap2::Swap2(int nkv) { _nkv = nkv; #ifdef USE_TARGET_OFFLOAD vkv = (double *)aligned_alloc(64, _nkv * sizeof(double)); vec_vkzm = (double *)aligned_alloc(64, _nkv * _nkv * sizeof(double)); #pragma omp parallel for collapse(2) for (int i = 0; i < _nkv; i++) { for (int j = 0; j < _nkv; j++) { vkv[i] = 0.0; vec_vkzm[_nkv * i +j] = 0.0; } } #else vkv = new double[_nkv](); vec_vkzm = new double[_nkv * _nkv](); vkzm = new double*[nkv]; for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv; #endif // USE TARGET_OFFLOAD _last_vector = 0; _last_matrix = 0; } Swap2::~Swap2() { #ifdef USE_TARGET_OFFLOAD free(vkv); free(vec_vkzm); #else delete[] vkv; delete[] vec_vkzm; delete[] vkzm; #endif // USE_TARGET_OFFLOAD } Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { fstream input; Swap2 *instance = NULL; int fnkv, fnlmmt, fnrvc; double **fvkzm = NULL; double *fvkzm = NULL; double *fvkv = NULL; double value; input.open(file_name.c_str(), ios::in | ios::binary); if (input.is_open()) { input.read(reinterpret_cast<char *>(&fnkv), sizeof(int)); instance = new Swap2(fnkv); fvkzm = instance->get_matrix(); fvkzm = instance->vec_vkzm; fvkv = instance->get_vector(); for (int vj = 0; vj < fnkv; vj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading @@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { for (int mi = 0; mi < fnkv; mi++) { for (int mj = 0; mj < fnkv; mj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); fvkzm[mi][mj] = value; fvkzm[fnkv * mi + mj] = value; } } input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading Loading @@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) { void Swap2::push_matrix(double value) { int col = _last_matrix % (_nkv - 1); int row = _last_matrix - (_nkv * row); vkzm[row][col] = value; vec_vkzm[nkv * row + col] = value; _last_matrix++; } Loading Loading @@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) { } for (int mi = 0; mi < _nkv; mi++) { for (int mj = 0; mj < _nkv; mj++) { value = vkzm[mi][mj]; value = vec_vkzm[nkv * mi + mj]; output.write(reinterpret_cast<const char*>(&value), sizeof(double)); } } Loading Loading @@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) { } } for (int mi = 0; mi < _nkv; mi++) { int nkvi = nkv * mi; for (int mj = 0; mj < _nkv; mj++) { if (vkzm[mi][mj] != other.vkzm[mi][mj]) { if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) { return false; } } Loading @@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) { _exril = 0.0; // Array initialization xv = new double[nxv](); yv = new double[nyv](); zv = new double[nzv](); _nlmmt = _lm * (_lm + 2) * 2; _nrvc = _nxv * _nyv * _nzv; vec_wsum = new dcomplex[nrvc * nlmmt](); wsum = new dcomplex*[nlmmt]; for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc; #ifdef USE_TARGET_OFFLOAD xv = (double *)aligned_alloc(64, sizeof(double) * _nxv); yv = (double *)aligned_alloc(64, sizeof(double) * _nyv); zv = (double *)aligned_alloc(64, sizeof(double) * _nzv); vec_wsum = (dcomplex *)aligned_alloc(64, sizeof(dcomplex) * _nrvc * _nlmmt); #else xv = new double[_nxv](); yv = new double[_nyv](); zv = new double[_nzv](); vec_wsum = new dcomplex[_nrvc * _nlmmt](); #endif // USE_TARGET_OFFLOAD } TFRFME::~TFRFME() { #ifdef USE_TARGET_OFFLOAD free(xv); free(yv); free(zv); free(vec_wsum); #else delete[] xv; delete[] yv; delete[] zv; delete[] vec_wsum; delete[] wsum; #endif } TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) { for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { value = elements[index] + elements[index + 1] * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; index += 2; } // wi loop } // wj loop Loading Loading @@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) { input.read(reinterpret_cast<char *>(&rval), sizeof(double)); input.read(reinterpret_cast<char *>(&ival), sizeof(double)); dcomplex value = rval + ival * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; } // wi loop } // wj loop input.close(); Loading Loading @@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) { int index = 0; for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { ptr_elements[index++] = real(wsum[wi][wj]); ptr_elements[index++] = imag(wsum[wi][wj]); ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]); ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]); } // wi loop } // wj loop rec_ptr_list.append(ptr_elements); Loading Loading @@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) { output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double)); for (int wj = 0; wj < _nrvc; wj++) { for (int wi = 0; wi < _nlmmt; wi++) { double rval = real(wsum[wi][wj]); double ival = imag(wsum[wi][wj]); double rval = real(vec_wsum[nrvc * wi + wj]); double ival = imag(vec_wsum[nrvc * wi + wj]); output.write(reinterpret_cast<char *>(&rval), sizeof(double)); output.write(reinterpret_cast<char *>(&ival), sizeof(double)); } // wi loop Loading Loading @@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) { } } for (int wi = 0; wi < _nlmmt; wi++) { int i = _nrvc * wi; for (int wj = 0; wj < _nrvc; wj++) { if (wsum[wi][wj] != other.wsum[wi][wj]) { if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) { return false; } } // wj loop Loading Loading
build/configure.sh +6 −6 Original line number Diff line number Diff line Loading @@ -364,8 +364,8 @@ result=$? if [ "x$result" = "x0" ]; then CLANGFLAGS=" -stdlib=libstdc++" fi echo -n "configure: checking wether $CXX works... " echo -n "configure: checking wether $CXX works... " >>configure.log echo -n "configure: checking whether $CXX works... " echo -n "configure: checking whether $CXX works... " >>configure.log cat > test_compiler.cpp <<EOF int main() { int i = -1; Loading @@ -385,8 +385,8 @@ else echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log exit 2 fi echo -n "configure: checking wether $CXX supports -ggdb... " echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log echo -n "configure: checking whether $CXX supports -ggdb... " echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log $CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log result=$? if [ "x$result" = "x0" ]; then Loading @@ -410,8 +410,8 @@ else echo "no" echo "no" >>configure.log fi echo -n "configure: checking wether $CXX is a MPI compiler... " echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log echo -n "configure: checking whether $CXX is a MPI compiler... " echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log cat > test_compiler.cpp <<EOF # include <mpi.h> int main() { Loading
src/include/tfrfme.h +20 −30 Original line number Diff line number Diff line Loading @@ -28,7 +28,7 @@ class Swap1 { protected: //! Index of the last element to be filled. int _last_index; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave-numbers. int _nkv; //! NLMMT = 2 * LM * (LM + 2) int _nlmmt; Loading Loading @@ -69,7 +69,7 @@ public: /*! \brief Swap1 instance constructor. * * \param lm: `int` Maximum field expansion order. * \param _nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ Swap1(int lm, int nkv); Loading Loading @@ -130,10 +130,8 @@ protected: int _last_vector; //! Index of the last matrix element to be filled. int _last_matrix; //! Number of vector coordinates. QUESTION: correct? //! Number of beam description wave numbers. int _nkv; //! Contiguous vector of VKZM matrix. double *vec_vkzm; //! QUESTION: definition? double _apfafa; //! QUESTION: definition? Loading @@ -152,13 +150,13 @@ protected: double _delxyz; //! QUESTION: definition? double _vknmx; //! QUESTION: definition? //! Wave number grid spacing. double _delk; //! QUESTION: definition? //! Square of wave number grid spacing. double _delks; //! NLMMT = LM * (LM + 2) * 2 int _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Number of radial vector coordinates. int _nrvc; /*! \brief Load a Swap2 instance from a HDF5 binary file. Loading Loading @@ -192,12 +190,12 @@ public: const int &last_vector = _last_vector; //! Read-only view on the index of the last matrix element to be filled. const int &last_matrix = _last_matrix; //! Read-only view on the number of vector coordinates. QUESTION: correct? //! Read-only view on the number of beam description wave numbers. const int &nkv = _nkv; //! QUESTION: definition? double *vkv; //! QUESTION: definition? double **vkzm; double *vec_vkzm; //! QUESTION: definition? const double &apfafa = _apfafa; //! QUESTION: definition? Loading @@ -222,12 +220,12 @@ public: const double &delks = _delks; //! NLMMT = LM * (LM + 2) * 2 const int &nlmmt = _nlmmt; //! Number of radial vector coordinates. QUESTION: correct? //! Read-only view on the number of radial vector coordinates. const int &nrvc = _nrvc; /*! \brief Swap2 instance constructor. * * \param nkv: `int` Number of vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. */ Swap2(int nkv); Loading @@ -244,15 +242,9 @@ public: */ static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY"); /*! \brief Get the pointer to the VKZM matrix. * * \return value: `double **` Pointer to the VKZM matrix. */ double **get_matrix() { return vkzm; } /*! \brief Calculate the necessary amount of memory to create a new instance. * * \param nkv: `int` Number of radial vector coordinates. QUESTION: correct? * \param nkv: `int` Number of beam description wave numbers. * \return size: `long` The necessary memory size in bytes. */ static long get_size(int nkv); Loading Loading @@ -316,11 +308,11 @@ protected: int _nlmmt; //! NRVC = NXV * NYV * NZV int _nrvc; //! Field expansion mode identifier. //! Beam description mode. int _lmode; //! Maximum field expansion order. int _lm; //! QUESTION: definition? //! Number of beam description wave numbers. int _nkv; //! Number of computed X coordinates. int _nxv; Loading @@ -332,11 +324,11 @@ protected: double _vk; //! External medium refractive index double _exri; //! QUESTION: definition? //! Numerical aperture. double _an; //! QUESTION: definition? //! Filling factor. double _ff; //! QUESTION: definition? //! Lens transmission. double _tra; //! QUESTION: definition? double _spd; Loading @@ -350,8 +342,6 @@ protected: double *yv; //! Vector of computed z positions double *zv; //! QUESTION: definition? dcomplex *vec_wsum; /*! \brief Load a configuration instance from a HDF5 binary file. * Loading Loading @@ -402,11 +392,11 @@ public: const double& vk = _vk; //! Read-only view on external medium refractive index const double& exri = _exri; //! QUESTION: definition? //! Read-only view on numeric aperture. const double& an = _an; //! QUESTION: definition? //! Read-only view on filling factor. const double& ff = _ff; //! QUESTION: definition? //! Read-only view on lens transmission. const double& tra = _tra; //! QUESTION: definition? const double& spd = _spd; Loading @@ -415,7 +405,7 @@ public: //! QUESTION: definition? const double& exril = _exril; //! QUESTION: definition? dcomplex **wsum; dcomplex *vec_wsum; /*! \brief Trapping configuration instance constructor. * Loading
src/libnptm/Configuration.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil // Read optional configuration data used only by the C++ code. while (num_lines > last_read_line) { str_target = file_lines[last_read_line++]; if (str_target.size() > 0) { if (str_target.size() > 15) { if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) { regex_search(str_target, m, re); short refine_flag = (short)stoi(m.str()); conf->_refine_flag = refine_flag; } else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { } if (str_target.size() > 14) { if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) { regex_search(str_target, m, re); short dyn_order_flag = (short)stoi(m.str()); conf->_dyn_order_flag = dyn_order_flag; Loading
src/libnptm/clu_subs.cpp +115 −102 Original line number Diff line number Diff line Loading @@ -47,10 +47,6 @@ #include <omp.h> #endif #ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory #endif using namespace std; void apc( Loading Loading @@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double xd = 0.0, xn = 0.0; Loading Loading @@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) { } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void cms(dcomplex **am, ParticleDescriptor *c1) { dcomplex dm, de, cgh, cgk; Loading Loading @@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) { delete[] svs; } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1, double *rac3j Loading Loading @@ -862,13 +858,13 @@ dcomplex ghit_d( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, ParticleDescriptor *c1 Loading Loading @@ -1079,9 +1075,9 @@ dcomplex ghit( } return result; } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, Loading Loading @@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #else // #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) // #endif #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) #endif for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; dcomplex am = cc0; Loading Loading @@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum3 = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) // #else // #pragma omp parallel for simd reduction(+:sum2,sum3) // #endif #pragma omp parallel for simd reduction(+:sum2,sum3) #endif for (int i14 = 0; i14 < c1->nlem; i14++) { int ie = i14 + c1->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); Loading @@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) { } // i14 loop double sumpi = 0.0; dcomplex sumpd = cc0; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #else // #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) // #endif #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd) #endif for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c1->nlem; j16++) { int je = j16 + c1->nlem; Loading Loading @@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) { } } #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif #ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp begin declare target device_type(any) // #endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; Loading Loading @@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } #ifdef USE_TARGET_OFFLOAD #pragma omp end declare target #endif // #ifdef USE_TARGET_OFFLOAD // #pragma omp end declare target // #endif void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; Loading Loading @@ -2005,11 +2004,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) // #else // #pragma omp parallel for simd reduction(+:c1, c2) // #endif #pragma omp parallel for simd reduction(+:c1, c2) #endif for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); Loading @@ -2027,11 +2027,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp teams distribute parallel for // #else // #pragma omp parallel for // #endif #pragma omp parallel for #endif for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; ctqce[ipo][0] = cc0; Loading Loading @@ -2063,11 +2064,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #else // #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) // #endif #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) #endif for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; Loading Loading @@ -2140,11 +2142,12 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd // #else // #pragma omp parallel for simd // #endif #pragma omp parallel for simd #endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; Loading Loading @@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) // #else // #pragma omp parallel for simd reduction(+:sums, sum21) // #endif #pragma omp parallel for simd reduction(+:sums, sum21) #endif for (int l10 = 1; l10 <= c1->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; Loading Loading @@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) { #ifdef USE_NVTX nvtxRangePush("scr0 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) // #else // #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) // #endif #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; Loading Loading @@ -2328,11 +2333,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 1"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) // #else // #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) // #endif #pragma omp parallel for simd reduction(-:s11, s21, s12, s22) #endif for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; Loading Loading @@ -2384,11 +2390,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #else // #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) // #endif #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) #endif for (int i14 = 1; i14 <= c1->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; Loading Loading @@ -2418,11 +2425,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(4) // #else // #pragma omp parallel for simd collapse(4) // #endif #pragma omp parallel for simd collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading @@ -2445,11 +2453,12 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for collapse(4) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for collapse(4) // #else // #pragma omp parallel for collapse(4) // #endif #pragma omp parallel for collapse(4) #endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { Loading Loading @@ -2582,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(3) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(3) // #else // #pragma omp parallel for simd collapse(3) // #endif #pragma omp parallel for simd collapse(3) #endif for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { Loading Loading @@ -2632,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; Loading Loading @@ -2669,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1) * c1->nlem + i0 - 1; Loading @@ -2682,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) { } // i0 loop } // i1 loop dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #else // #ifdef USE_TARGET_OFFLOAD // #pragma omp target teams distribute parallel for simd collapse(2) // #else // #pragma omp parallel for simd collapse(2) // #endif #pragma omp parallel for simd collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; Loading
src/libnptm/tfrfme.cpp +54 −23 Original line number Diff line number Diff line Loading @@ -44,6 +44,10 @@ #include "../include/file_io.h" #endif #ifdef USE_TARGET_OFFLOAD #include <cstdlib> #endif using namespace std; // >>> START OF Swap1 CLASS IMPLEMENTATION <<< Loading Loading @@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) { // >>> START OF Swap2 CLASS IMPLEMENTATION <<< Swap2::Swap2(int nkv) { _nkv = nkv; #ifdef USE_TARGET_OFFLOAD vkv = (double *)aligned_alloc(64, _nkv * sizeof(double)); vec_vkzm = (double *)aligned_alloc(64, _nkv * _nkv * sizeof(double)); #pragma omp parallel for collapse(2) for (int i = 0; i < _nkv; i++) { for (int j = 0; j < _nkv; j++) { vkv[i] = 0.0; vec_vkzm[_nkv * i +j] = 0.0; } } #else vkv = new double[_nkv](); vec_vkzm = new double[_nkv * _nkv](); vkzm = new double*[nkv]; for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv; #endif // USE TARGET_OFFLOAD _last_vector = 0; _last_matrix = 0; } Swap2::~Swap2() { #ifdef USE_TARGET_OFFLOAD free(vkv); free(vec_vkzm); #else delete[] vkv; delete[] vec_vkzm; delete[] vkzm; #endif // USE_TARGET_OFFLOAD } Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { fstream input; Swap2 *instance = NULL; int fnkv, fnlmmt, fnrvc; double **fvkzm = NULL; double *fvkzm = NULL; double *fvkv = NULL; double value; input.open(file_name.c_str(), ios::in | ios::binary); if (input.is_open()) { input.read(reinterpret_cast<char *>(&fnkv), sizeof(int)); instance = new Swap2(fnkv); fvkzm = instance->get_matrix(); fvkzm = instance->vec_vkzm; fvkv = instance->get_vector(); for (int vj = 0; vj < fnkv; vj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading @@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) { for (int mi = 0; mi < fnkv; mi++) { for (int mj = 0; mj < fnkv; mj++) { input.read(reinterpret_cast<char *>(&value), sizeof(double)); fvkzm[mi][mj] = value; fvkzm[fnkv * mi + mj] = value; } } input.read(reinterpret_cast<char *>(&value), sizeof(double)); Loading Loading @@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) { void Swap2::push_matrix(double value) { int col = _last_matrix % (_nkv - 1); int row = _last_matrix - (_nkv * row); vkzm[row][col] = value; vec_vkzm[nkv * row + col] = value; _last_matrix++; } Loading Loading @@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) { } for (int mi = 0; mi < _nkv; mi++) { for (int mj = 0; mj < _nkv; mj++) { value = vkzm[mi][mj]; value = vec_vkzm[nkv * mi + mj]; output.write(reinterpret_cast<const char*>(&value), sizeof(double)); } } Loading Loading @@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) { } } for (int mi = 0; mi < _nkv; mi++) { int nkvi = nkv * mi; for (int mj = 0; mj < _nkv; mj++) { if (vkzm[mi][mj] != other.vkzm[mi][mj]) { if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) { return false; } } Loading @@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) { _exril = 0.0; // Array initialization xv = new double[nxv](); yv = new double[nyv](); zv = new double[nzv](); _nlmmt = _lm * (_lm + 2) * 2; _nrvc = _nxv * _nyv * _nzv; vec_wsum = new dcomplex[nrvc * nlmmt](); wsum = new dcomplex*[nlmmt]; for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc; #ifdef USE_TARGET_OFFLOAD xv = (double *)aligned_alloc(64, sizeof(double) * _nxv); yv = (double *)aligned_alloc(64, sizeof(double) * _nyv); zv = (double *)aligned_alloc(64, sizeof(double) * _nzv); vec_wsum = (dcomplex *)aligned_alloc(64, sizeof(dcomplex) * _nrvc * _nlmmt); #else xv = new double[_nxv](); yv = new double[_nyv](); zv = new double[_nzv](); vec_wsum = new dcomplex[_nrvc * _nlmmt](); #endif // USE_TARGET_OFFLOAD } TFRFME::~TFRFME() { #ifdef USE_TARGET_OFFLOAD free(xv); free(yv); free(zv); free(vec_wsum); #else delete[] xv; delete[] yv; delete[] zv; delete[] vec_wsum; delete[] wsum; #endif } TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) { Loading Loading @@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) { for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { value = elements[index] + elements[index + 1] * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; index += 2; } // wi loop } // wj loop Loading Loading @@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) { input.read(reinterpret_cast<char *>(&rval), sizeof(double)); input.read(reinterpret_cast<char *>(&ival), sizeof(double)); dcomplex value = rval + ival * I; instance->wsum[wi][wj] = value; instance->vec_wsum[nrvc * wi + wj] = value; } // wi loop } // wj loop input.close(); Loading Loading @@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) { int index = 0; for (int wj = 0; wj < nrvc; wj++) { for (int wi = 0; wi < nlmmt; wi++) { ptr_elements[index++] = real(wsum[wi][wj]); ptr_elements[index++] = imag(wsum[wi][wj]); ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]); ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]); } // wi loop } // wj loop rec_ptr_list.append(ptr_elements); Loading Loading @@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) { output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double)); for (int wj = 0; wj < _nrvc; wj++) { for (int wi = 0; wi < _nlmmt; wi++) { double rval = real(wsum[wi][wj]); double ival = imag(wsum[wi][wj]); double rval = real(vec_wsum[nrvc * wi + wj]); double ival = imag(vec_wsum[nrvc * wi + wj]); output.write(reinterpret_cast<char *>(&rval), sizeof(double)); output.write(reinterpret_cast<char *>(&ival), sizeof(double)); } // wi loop Loading Loading @@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) { } } for (int wi = 0; wi < _nlmmt; wi++) { int i = _nrvc * wi; for (int wj = 0; wj < _nrvc; wj++) { if (wsum[wi][wj] != other.wsum[wi][wj]) { if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) { return false; } } // wj loop Loading