Loading src/cluster/cluster.cpp +1 −1 Original line number Diff line number Diff line Loading @@ -862,7 +862,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf #ifdef USE_NVTX nvtxRangePush("Average calculation"); #endif ztm(cid->am, cid->c1, cid->c9); ztm(cid->am, cid->c1); #ifdef DEBUG_AM VirtualAsciiFile *outam3 = new VirtualAsciiFile(); string outam3_name = output_path + "/c_AM3_JXI" + to_string(jxi488) + ".txt"; Loading src/include/Commons.h +30 −62 Original line number Diff line number Diff line Loading @@ -38,65 +38,8 @@ #endif class ParticleDescriptor; class mixMPI; /*! \brief Representation of the FORTRAN C9 blocks. */ class C9 { protected: //! \brief Number of rows in the GIS and GLS matrices int gis_size_0; //! \brief Number of rows in the SAM matrix int sam_size_0; public: //! \brief NLEM = LE * (LE + 2) int nlem; //! \brief NLEMT = 2 * LE * (LE + 2) int nlemt; //! \brief QUESTION: definition? dcomplex **gis; //! \brief QUESTION: definition? dcomplex **gls; //! \brief QUESTION: definition? dcomplex **sam; /*! \brief C9 instance constructor. * * \param ndi: `int` NDI = NSPH * LI * (LI + 2) * \param nlem: `int` NLEM = LE * (LE + 2) * \param ndit: `int` NDIT = 2 * NSPH * LI * (LI + 2) * \param nlemt: `int` NLEMT = 2 * LE * (LE + 2) */ C9(int ndi, int nlem, int ndit, int nlemt); /*! \brief C9 instance constructor copying contents from preexisting object. * * \param rhs: `C9` preexisting object to copy from */ C9(const C9& rhs); /*! \brief C9 instance destroyer. */ ~C9(); #ifdef MPI_VERSION /*! \brief C9 instance constructor copying all contents off MPI broadcast from MPI process 0 * * \param mpidata: `mixMPI *` pointer to MPI data structure. */ C9(const mixMPI *mpidata); /*! \brief send C9 instance from MPI process 0 via MPI broadcasts to all other processes * * \param mpidata: `mixMPI *` pointer to MPI data structure. */ void mpibcast(const mixMPI *mpidata); #endif }; /*! \brief structure with essential MPI data. /*! \brief Structure with essential MPI data. */ class mixMPI { public: Loading Loading @@ -133,8 +76,6 @@ class ClusterIterationData { public: //! \brief Pointer to a ParticleDescriptor structure. ParticleDescriptor *c1; //! \brief Pointer to a C9 structure. C9 *c9; //! \brief Vector of geometric asymmetry factors. double *gaps; double **tqse; Loading Loading @@ -264,7 +205,14 @@ protected: // >>> END OF SECTION NEEDED BY SPHERE AND CLUSTER <<< // // >>> NEEDED BY CLUSTER <<< // //! \brief Contiguous space for TSAS. dcomplex *vec_tsas; //! \brief Contiguous space for GIS. dcomplex *vec_gis; //! \brief Contiguous space for GLS. dcomplex *vec_gls; //! \brief Contiguous space for SAM. dcomplex *vec_sam; // >>> END OF SECTION NEEDED BY CLUSTER <<< // // >>> NEEDED BY CLUSTER AND INCLU <<< Loading @@ -272,9 +220,9 @@ protected: int _le; //! \brief Maximum field expansion order. int _lm; //! \brief NLIM = NSPH * LI * (LI + 2) //! \brief NLIM = LI * (LI + 2) int _nlim; //! \brief NLEM = NSPH * LE * (LE + 2) //! \brief NLEM = LE * (LE + 2) int _nlem; //! \brief NLEMT = 2 * NLEM int _nlemt; Loading @@ -292,6 +240,10 @@ protected: int _lmtpos; //! \brief NV3J = (LM * (LM + 1) * (2 * LM + 7)) / 6 int _nv3j; //! \brief NDI = NSPH * NLIM int _ndi; //! \brief NDIT = 2 * NSPH * NLIM int _ndit; //! \brief Contiguous space for AM0M dcomplex *vec_am0m; //! \brief Contiguous space for FSAC Loading @@ -304,6 +256,10 @@ protected: int *vec_ind3j; // >>> END OF SECTION NEEDED BY CLUSTER AND INCLU <<< // // >>> NEEDED BY INCLU <<< // //! \brief NDM = NDIT + NLEMT int _ndm; // >>> END OF SECTION NEEDED BY INCLU <<< // public: // >>> COMMON TO ALL DESCRIPTOR TYPES <<< // //! \brief Base sub-class identification code. Loading Loading @@ -405,6 +361,12 @@ public: double ecs; //! \brief Total absorption cross-section. double acs; //! \brief TBD. dcomplex **gis; //! \brief TBD. dcomplex **gls; //! \brief TBD. dcomplex **sam; // >>> END OF SECTION NEEDED BY CLUSTER <<< // // >>> NEEDED BY CLUSTER AND INCLU <<< Loading Loading @@ -432,6 +394,12 @@ public: const int& lmtpos = _lmtpos; //! \brief Read-only view of NV3J. const int& nv3j = _nv3j; //! \brief Read-only view of NDI. const int& ndi = _ndi; //! \brief Read-only view of NDIT. const int& ndit = _ndit; //! \brief Read-only view of NDM. const int& ndm = _ndm; //! \brief TBD dcomplex *vh; Loading src/include/clu_subs.h +1 −2 Original line number Diff line number Diff line Loading @@ -404,8 +404,7 @@ void tqr( * * \param am: `complex double **` * \param c1: `ParticleDescriptor *` Pointer to a ParticleDescriptor instance. * \param c9: `C9 *` Pointer to a C9 instance. */ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9); void ztm(dcomplex **am, ParticleDescriptor *c1); #endif src/libnptm/Commons.cpp +74 −92 Original line number Diff line number Diff line Loading @@ -36,93 +36,6 @@ #include <mpi.h> #endif C9::C9(int ndi, int _nlem, int ndit, int _nlemt) { gis_size_0 = ndi; sam_size_0 = ndit; nlem = _nlem; nlemt = _nlemt; gis = new dcomplex*[ndi]; gls = new dcomplex*[ndi]; // allocate these arrays to be contiguous, C-style gis[0] = new dcomplex[ndi*nlem](); gls[0] = new dcomplex[ndi*nlem](); for (int gi = 1; gi < ndi; gi++) { gis[gi] = gis[0] + gi*nlem; gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[ndit]; sam[0] = new dcomplex[ndit*nlemt](); for (int si = 1; si < ndit; si++) sam[si] = sam[0] + si*nlemt; } C9::C9(const C9& rhs) { gis_size_0 = rhs.gis_size_0; sam_size_0 = rhs.sam_size_0; nlem = rhs.nlem; nlemt = rhs.nlemt; gis = new dcomplex*[gis_size_0]; gls = new dcomplex*[gis_size_0]; // allocate these arrays to be contiguous, C-style gis[0] = new dcomplex[gis_size_0*nlem](); gls[0] = new dcomplex[gis_size_0*nlem](); memcpy(gis[0], rhs.gis[0], gis_size_0*nlem*sizeof(dcomplex)); memcpy(gls[0], rhs.gls[0], gis_size_0*nlem*sizeof(dcomplex)); for (int gi = 1; gi < gis_size_0; gi++) { gis[gi] = gis[0] + gi*nlem; gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[sam_size_0]; sam[0] = new dcomplex[sam_size_0*nlemt](); memcpy(sam[0], rhs.sam[0], sam_size_0*nlemt*sizeof(dcomplex)); for (int si = 1; si < sam_size_0; si++) { sam[si] = sam[0]+si*nlemt; } } C9::~C9() { delete[] gis[0]; delete[] gls[0]; delete[] gis; delete[] gls; delete[] sam[0]; delete[] sam; } #ifdef MPI_VERSION C9::C9(const mixMPI *mpidata) { MPI_Bcast(&gis_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&sam_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlemt, 1, MPI_INT, 0, MPI_COMM_WORLD); gis = new dcomplex*[gis_size_0]; gls = new dcomplex*[gis_size_0]; gis[0] = new dcomplex[gis_size_0*nlem](); gls[0] = new dcomplex[gis_size_0*nlem](); MPI_Bcast(gis[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(gls[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); for (int gi = 1; gi < gis_size_0; gi++) { gis[gi] = gis[0] + gi*nlem; gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[sam_size_0]; sam[0] = new dcomplex[sam_size_0*nlemt](); MPI_Bcast(sam[0], sam_size_0*nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); for (int si = 1; si < sam_size_0; si++) { sam[si] = sam[0]+si*nlemt; } } void C9::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&gis_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&sam_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlemt, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(gis[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(gls[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(sam[0], sam_size_0*nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); } #endif mixMPI::mixMPI() { mpirunning = 0; rank = 0; Loading Loading @@ -152,7 +65,6 @@ ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, Scatter c1 = new ParticleDescriptorCluster(gconf, sconf); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9 = new C9(ndi, c1->nlem, 2 * ndi, 2 * c1->nlem); gaps = new double[c1->nsph](); tqev = new double[3](); tqsv = new double[3](); Loading Loading @@ -254,7 +166,6 @@ ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) { c1 = new ParticleDescriptorCluster(reinterpret_cast<ParticleDescriptorCluster &>(*(rhs.c1))); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9 = new C9(*(rhs.c9)); gaps = new double[c1->nsph](); for (int gi = 0; gi < c1->nsph; gi++) gaps[gi] = rhs.gaps[gi]; tqev = new double[3](); Loading Loading @@ -400,7 +311,6 @@ ClusterIterationData::ClusterIterationData(const mixMPI *mpidata, const int devi c1 = new ParticleDescriptorCluster(mpidata); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9 = new C9(mpidata); gaps = new double[c1->nsph](); MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); tqev = new double[3](); Loading Loading @@ -534,7 +444,6 @@ void ClusterIterationData::mpibcast(const mixMPI *mpidata) { c1->mpibcast(mpidata); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9->mpibcast(mpidata); MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(tqev, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(tqsv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); Loading Loading @@ -615,7 +524,6 @@ ClusterIterationData::~ClusterIterationData() { } delete[] zpv; delete c1; delete c9; delete[] gaps; for (int ti = 1; ti > -1; ti--) { delete[] tqse[ti]; Loading Loading @@ -690,6 +598,9 @@ ParticleDescriptor::ParticleDescriptor(GeometryConfiguration *gconf, ScattererCo _lmtpo = 0; _lmtpos = 0; _nv3j = 0; _ndi = 0; _ndit = 0; _ndm = 0; _num_configurations = sconf->configurations; _num_layers = (sconf->use_external_sphere) ? 1 : 0; for (int nli = 0; nli < num_configurations; nli++) _num_layers += sconf->get_nshl(nli); Loading Loading @@ -821,6 +732,9 @@ ParticleDescriptor::ParticleDescriptor(const ParticleDescriptor &rhs) { _lmtpo = rhs._lmtpo; _lmtpos = rhs._lmtpos; _nv3j = rhs._nv3j; _ndi = rhs._ndi; _ndit = rhs._ndit; _ndm = rhs._ndm; _num_configurations = rhs._num_configurations; _num_layers = rhs._num_layers; Loading Loading @@ -904,11 +818,15 @@ ParticleDescriptor::ParticleDescriptor(const ParticleDescriptor &rhs) { // >>> NEEDED BY CLUSTER <<< vintt = NULL; tfsas = 0.0 + I * 0.0; vec_tsas = NULL; tsas = NULL; gcs = 0.0; scs = 0.0; ecs = 0.0; acs = 0.0; vec_gis = NULL; vec_gls = NULL; vec_sam = NULL; // >>> NEEDED BY CLUSTER AND INCLU <<< vec_am0m = NULL; vec_fsac = NULL; Loading Loading @@ -964,6 +882,9 @@ ParticleDescriptor::ParticleDescriptor(const mixMPI *mpidata) { MPI_Bcast(&_lmtpo, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_lmtpos, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_nv3j, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndi, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndit, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndm, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_configurations, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_layers, 1, MPI_INT, 0, MPI_COMM_WORLD); Loading Loading @@ -1039,6 +960,9 @@ void ParticleDescriptor::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&_lmtpo, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_lmtpos, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_nv3j, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndi, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndit, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndm, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_configurations, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_layers, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(vec_rmi, _nsph * _li, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); Loading Loading @@ -1085,6 +1009,9 @@ void ParticleDescriptor::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&scs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&ecs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&acs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(vec_gis, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(vec_gls, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(vec_sam, _ndit * _nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); } // >>> NEEDED BY CLUSTER AND INCLU <<< // if (_class_type == CLUSTER_TYPE || _class_type == INCLUSION_TYPE) { Loading Loading @@ -1211,6 +1138,12 @@ ParticleDescriptor::~ParticleDescriptor() { delete[] vintt; delete[] vec_tsas; delete[] tsas; delete[] vec_gis; delete[] vec_gls; delete[] vec_sam; delete[] gis; delete[] gls; delete[] sam; } } Loading Loading @@ -1260,6 +1193,8 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(GeometryConfiguration *gcon _lmtpo = _li + _le + 1; _lmtpos = _lmtpo * _lmtpo; _nv3j = (_lm * (_lm + 1) * (2 * _lm + 7)) / 6; _ndi = _nsph * _nlim; _ndit = 2 * _nsph * _nlim; vec_am0m = new dcomplex[_nlemt * _nlemt](); vec_fsac = new dcomplex[4](); vec_sac = new dcomplex[4](); Loading Loading @@ -1293,6 +1228,19 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(GeometryConfiguration *gcon ind3j = new int*[_lm + 1]; for (int ii = 0; ii <= _lm; ii++) ind3j[ii] = vec_ind3j + (_lm * ii); rac3j = new double[_lmtpo](); // Needed by CLUSTER vec_gis = new dcomplex[_ndi * _nlem](); gis = new dcomplex*[_ndi]; vec_gls = new dcomplex[_ndi * _nlem](); gls = new dcomplex*[_ndi]; for (int gi = 0; gi < _ndi; gi++) { gis[gi] = vec_gis + (gi * _nlem); gls[gi] = vec_gls + (gi * _nlem); } vec_sam = new dcomplex[_ndit * _nlemt](); sam = new dcomplex*[_ndit]; for (int si = 0; si < _ndit; si++) sam[si] = vec_sam + (si * _nlemt); } ParticleDescriptorCluster::ParticleDescriptorCluster(const ParticleDescriptorCluster &rhs) : ParticleDescriptor(rhs) { Loading Loading @@ -1401,6 +1349,24 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(const ParticleDescriptorClu for (int ii = 0; ii <= _lm; ii++) ind3j[ii] = vec_ind3j + (_lm * ii); rac3j = new double[_lmtpo]; for (int ri = 0; ri < _lmtpo; ri++) rac3j[ri] = rhs.rac3j[ri]; // Needed by CLUSTER vec_gis = new dcomplex[_ndi * _nlem]; vec_gls = new dcomplex[_ndi * _nlem]; for (int vi = 0; vi < _ndi * _nlem; vi++) { vec_gis[vi] = rhs.vec_gis[vi]; vec_gls[vi] = rhs.vec_gls[vi]; } gis = new dcomplex*[_ndi]; gls = new dcomplex*[_ndi]; for (int gi = 0; gi < _ndi; gi++) { gis[gi] = vec_gis + (gi * _nlem); gls[gi] = vec_gls + (gi * _nlem); } vec_sam = new dcomplex[_ndit * _nlemt]; for (int vi = 0; vi < _ndit * _nlemt; vi++) vec_sam[vi] = rhs.vec_sam[vi]; sam = new dcomplex*[_ndit]; for (int si = 0; si < _ndit; si++) sam[si] = vec_sam + (si * _nlemt); } #ifdef MPI_VERSION Loading Loading @@ -1448,6 +1414,22 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(const mixMPI *mpidata) : Pa MPI_Bcast(&scs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&ecs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&acs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); vec_gis = new dcomplex[_ndi * _nlem]; MPI_Bcast(vec_gis, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); vec_gls = new dcomplex[_ndi * _nlem]; MPI_Bcast(vec_gls, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); vec_sam = new dcomplex[_ndit * _nlemt]; MPI_Bcast(vec_sam, _ndit * _nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); gis = new dcomplex*[_ndi]; gls = new dcomplex*[_ndi]; for (int gi = 0; gi < _ndi; gi++) { gis[gi] = vec_gis + (gi * _nlem); gls[gi] = vec_gls + (gi * _nlem); } sam = new dcomplex*[_ndit]; for (int si = 0; si < _ndit; si++) { sam[si] = vec_sam + (si * _nlemt); } // >>> NEEDED BY CLUSTER AND INCLU <<< vec_am0m = new dcomplex[_nlemt * _nlemt]; int nlemts = _nlemt * _nlemt; Loading src/libnptm/clu_subs.cpp +24 −29 Original line number Diff line number Diff line Loading @@ -2533,11 +2533,9 @@ void tqr( tsk = u[0] * tqsv[0] + u[1] * tqsv[1] + u[2] * tqsv[2]; } void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; const np_int ndi = c1->nsph * c1->nlim; np_int ndit = 2 * ndi; // int i2 = 0; // old implementation #ifdef USE_NVTX nvtxRangePush("ZTM starts"); Loading @@ -2546,8 +2544,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { nvtxRangePush("ZTM parallel loop 1"); #endif // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c9->gis[0]; dcomplex *gls_v = c9->gls[0]; dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); int k2max = c1->li*(c1->li+2); int k3max = c1->le*(c1->le+2); Loading Loading @@ -2591,7 +2589,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { int m2 = -l2 - 1 + im2; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1)*c9->nlem + i3 - 1; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); } // close k3 loop, former l3 + im3 loops Loading @@ -2605,26 +2603,26 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { nvtxRangePush("ZTM loop 2"); #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c9->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; dcomplex sum2 = cc0; dcomplex sum3 = cc0; dcomplex sum4 = cc0; int i1e = i1 + ndi; int i1e = i1 + c1->ndi; int i3e = i3 + c1->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= ndi; i2++) { int i2e = i2 + ndi; int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1; for (int i2 = 1; i2 <= c1->ndi; i2++) { int i2e = i2 + c1->ndi; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; dcomplex gie = gis_v[vecindg_23]; dcomplex gle = gls_v[vecindg_23]; np_int vecinda_1 = (i1 - 1)*ndit; np_int vecinda_1e = (i1 - 1 + ndi)*ndit; np_int vecinda_1 = (i1 - 1) * c1->ndit; np_int vecinda_1e = (i1 - 1 + c1->ndi) * c1->ndit; dcomplex a1 = am_v[vecinda_1 + i2 - 1]; dcomplex a2 = am_v[vecinda_1 + i2e - 1]; dcomplex a3 = am_v[vecinda_1e + i2 - 1]; Loading @@ -2634,8 +2632,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { sum3 += (a3 * gie + a4 * gle); sum4 += (a3 * gle + a4 * gie); } // i2 loop int vecind1 = (i1 - 1)*c9->nlemt; int vecind1e = (i1e - 1)*c9->nlemt; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; sam_v[vecind1 + i3 - 1] = sum1; sam_v[vecind1 + i3e - 1] = sum2; sam_v[vecind1e + i3 - 1] = sum3; Loading @@ -2643,39 +2641,36 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { } // i3 loop } // i1 loop #pragma omp parallel for collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1)*c9->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gis_v[vecindex] = dconjg(gis_v[vecindex]); gls_v[vecindex] = dconjg(gls_v[vecindex]); // c9->gis[i1 - 1][i0 - 1] = dconjg(c9->gis[i1 - 1][i0 - 1]); // c9->gls[i1 - 1][i0 - 1] = dconjg(c9->gls[i1 - 1][i0 - 1]); } // i0 loop } // i1 loop int nlemt = c1->nlem + c1->nlem; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; dcomplex sum1 = cc0; dcomplex sum2 = cc0; for (int i1 = 1; i1 <= ndi; i1 ++) { int i1e = i1 + ndi; int vecind1 = (i1 - 1)*c9->nlemt; int vecind1e = (i1e - 1)*c9->nlemt; for (int i1 = 1; i1 <= c1->ndi; i1 ++) { int i1e = i1 + c1->ndi; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; a1 = sam_v[vecind1 + i3 - 1]; a2 = sam_v[vecind1e + i3 - 1]; int vecindex = (i1 - 1)*c9->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gie = gis_v[vecindex]; gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); } // i1 loop int vecind0 = (i0 - 1)*nlemt; int vecind0e = (i0e - 1)*nlemt; int vecind0 = (i0 - 1) * c1->nlemt; int vecind0e = (i0e - 1) * c1->nlemt; vec_am0m[vecind0 + i3 - 1] = -sum1; vec_am0m[vecind0e + i3 - 1] = -sum2; // c1->am0m[i0 - 1][i3 - 1] = -sum1; Loading Loading
src/cluster/cluster.cpp +1 −1 Original line number Diff line number Diff line Loading @@ -862,7 +862,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf #ifdef USE_NVTX nvtxRangePush("Average calculation"); #endif ztm(cid->am, cid->c1, cid->c9); ztm(cid->am, cid->c1); #ifdef DEBUG_AM VirtualAsciiFile *outam3 = new VirtualAsciiFile(); string outam3_name = output_path + "/c_AM3_JXI" + to_string(jxi488) + ".txt"; Loading
src/include/Commons.h +30 −62 Original line number Diff line number Diff line Loading @@ -38,65 +38,8 @@ #endif class ParticleDescriptor; class mixMPI; /*! \brief Representation of the FORTRAN C9 blocks. */ class C9 { protected: //! \brief Number of rows in the GIS and GLS matrices int gis_size_0; //! \brief Number of rows in the SAM matrix int sam_size_0; public: //! \brief NLEM = LE * (LE + 2) int nlem; //! \brief NLEMT = 2 * LE * (LE + 2) int nlemt; //! \brief QUESTION: definition? dcomplex **gis; //! \brief QUESTION: definition? dcomplex **gls; //! \brief QUESTION: definition? dcomplex **sam; /*! \brief C9 instance constructor. * * \param ndi: `int` NDI = NSPH * LI * (LI + 2) * \param nlem: `int` NLEM = LE * (LE + 2) * \param ndit: `int` NDIT = 2 * NSPH * LI * (LI + 2) * \param nlemt: `int` NLEMT = 2 * LE * (LE + 2) */ C9(int ndi, int nlem, int ndit, int nlemt); /*! \brief C9 instance constructor copying contents from preexisting object. * * \param rhs: `C9` preexisting object to copy from */ C9(const C9& rhs); /*! \brief C9 instance destroyer. */ ~C9(); #ifdef MPI_VERSION /*! \brief C9 instance constructor copying all contents off MPI broadcast from MPI process 0 * * \param mpidata: `mixMPI *` pointer to MPI data structure. */ C9(const mixMPI *mpidata); /*! \brief send C9 instance from MPI process 0 via MPI broadcasts to all other processes * * \param mpidata: `mixMPI *` pointer to MPI data structure. */ void mpibcast(const mixMPI *mpidata); #endif }; /*! \brief structure with essential MPI data. /*! \brief Structure with essential MPI data. */ class mixMPI { public: Loading Loading @@ -133,8 +76,6 @@ class ClusterIterationData { public: //! \brief Pointer to a ParticleDescriptor structure. ParticleDescriptor *c1; //! \brief Pointer to a C9 structure. C9 *c9; //! \brief Vector of geometric asymmetry factors. double *gaps; double **tqse; Loading Loading @@ -264,7 +205,14 @@ protected: // >>> END OF SECTION NEEDED BY SPHERE AND CLUSTER <<< // // >>> NEEDED BY CLUSTER <<< // //! \brief Contiguous space for TSAS. dcomplex *vec_tsas; //! \brief Contiguous space for GIS. dcomplex *vec_gis; //! \brief Contiguous space for GLS. dcomplex *vec_gls; //! \brief Contiguous space for SAM. dcomplex *vec_sam; // >>> END OF SECTION NEEDED BY CLUSTER <<< // // >>> NEEDED BY CLUSTER AND INCLU <<< Loading @@ -272,9 +220,9 @@ protected: int _le; //! \brief Maximum field expansion order. int _lm; //! \brief NLIM = NSPH * LI * (LI + 2) //! \brief NLIM = LI * (LI + 2) int _nlim; //! \brief NLEM = NSPH * LE * (LE + 2) //! \brief NLEM = LE * (LE + 2) int _nlem; //! \brief NLEMT = 2 * NLEM int _nlemt; Loading @@ -292,6 +240,10 @@ protected: int _lmtpos; //! \brief NV3J = (LM * (LM + 1) * (2 * LM + 7)) / 6 int _nv3j; //! \brief NDI = NSPH * NLIM int _ndi; //! \brief NDIT = 2 * NSPH * NLIM int _ndit; //! \brief Contiguous space for AM0M dcomplex *vec_am0m; //! \brief Contiguous space for FSAC Loading @@ -304,6 +256,10 @@ protected: int *vec_ind3j; // >>> END OF SECTION NEEDED BY CLUSTER AND INCLU <<< // // >>> NEEDED BY INCLU <<< // //! \brief NDM = NDIT + NLEMT int _ndm; // >>> END OF SECTION NEEDED BY INCLU <<< // public: // >>> COMMON TO ALL DESCRIPTOR TYPES <<< // //! \brief Base sub-class identification code. Loading Loading @@ -405,6 +361,12 @@ public: double ecs; //! \brief Total absorption cross-section. double acs; //! \brief TBD. dcomplex **gis; //! \brief TBD. dcomplex **gls; //! \brief TBD. dcomplex **sam; // >>> END OF SECTION NEEDED BY CLUSTER <<< // // >>> NEEDED BY CLUSTER AND INCLU <<< Loading Loading @@ -432,6 +394,12 @@ public: const int& lmtpos = _lmtpos; //! \brief Read-only view of NV3J. const int& nv3j = _nv3j; //! \brief Read-only view of NDI. const int& ndi = _ndi; //! \brief Read-only view of NDIT. const int& ndit = _ndit; //! \brief Read-only view of NDM. const int& ndm = _ndm; //! \brief TBD dcomplex *vh; Loading
src/include/clu_subs.h +1 −2 Original line number Diff line number Diff line Loading @@ -404,8 +404,7 @@ void tqr( * * \param am: `complex double **` * \param c1: `ParticleDescriptor *` Pointer to a ParticleDescriptor instance. * \param c9: `C9 *` Pointer to a C9 instance. */ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9); void ztm(dcomplex **am, ParticleDescriptor *c1); #endif
src/libnptm/Commons.cpp +74 −92 Original line number Diff line number Diff line Loading @@ -36,93 +36,6 @@ #include <mpi.h> #endif C9::C9(int ndi, int _nlem, int ndit, int _nlemt) { gis_size_0 = ndi; sam_size_0 = ndit; nlem = _nlem; nlemt = _nlemt; gis = new dcomplex*[ndi]; gls = new dcomplex*[ndi]; // allocate these arrays to be contiguous, C-style gis[0] = new dcomplex[ndi*nlem](); gls[0] = new dcomplex[ndi*nlem](); for (int gi = 1; gi < ndi; gi++) { gis[gi] = gis[0] + gi*nlem; gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[ndit]; sam[0] = new dcomplex[ndit*nlemt](); for (int si = 1; si < ndit; si++) sam[si] = sam[0] + si*nlemt; } C9::C9(const C9& rhs) { gis_size_0 = rhs.gis_size_0; sam_size_0 = rhs.sam_size_0; nlem = rhs.nlem; nlemt = rhs.nlemt; gis = new dcomplex*[gis_size_0]; gls = new dcomplex*[gis_size_0]; // allocate these arrays to be contiguous, C-style gis[0] = new dcomplex[gis_size_0*nlem](); gls[0] = new dcomplex[gis_size_0*nlem](); memcpy(gis[0], rhs.gis[0], gis_size_0*nlem*sizeof(dcomplex)); memcpy(gls[0], rhs.gls[0], gis_size_0*nlem*sizeof(dcomplex)); for (int gi = 1; gi < gis_size_0; gi++) { gis[gi] = gis[0] + gi*nlem; gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[sam_size_0]; sam[0] = new dcomplex[sam_size_0*nlemt](); memcpy(sam[0], rhs.sam[0], sam_size_0*nlemt*sizeof(dcomplex)); for (int si = 1; si < sam_size_0; si++) { sam[si] = sam[0]+si*nlemt; } } C9::~C9() { delete[] gis[0]; delete[] gls[0]; delete[] gis; delete[] gls; delete[] sam[0]; delete[] sam; } #ifdef MPI_VERSION C9::C9(const mixMPI *mpidata) { MPI_Bcast(&gis_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&sam_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlemt, 1, MPI_INT, 0, MPI_COMM_WORLD); gis = new dcomplex*[gis_size_0]; gls = new dcomplex*[gis_size_0]; gis[0] = new dcomplex[gis_size_0*nlem](); gls[0] = new dcomplex[gis_size_0*nlem](); MPI_Bcast(gis[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(gls[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); for (int gi = 1; gi < gis_size_0; gi++) { gis[gi] = gis[0] + gi*nlem; gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[sam_size_0]; sam[0] = new dcomplex[sam_size_0*nlemt](); MPI_Bcast(sam[0], sam_size_0*nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); for (int si = 1; si < sam_size_0; si++) { sam[si] = sam[0]+si*nlemt; } } void C9::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&gis_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&sam_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlemt, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(gis[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(gls[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(sam[0], sam_size_0*nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); } #endif mixMPI::mixMPI() { mpirunning = 0; rank = 0; Loading Loading @@ -152,7 +65,6 @@ ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, Scatter c1 = new ParticleDescriptorCluster(gconf, sconf); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9 = new C9(ndi, c1->nlem, 2 * ndi, 2 * c1->nlem); gaps = new double[c1->nsph](); tqev = new double[3](); tqsv = new double[3](); Loading Loading @@ -254,7 +166,6 @@ ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) { c1 = new ParticleDescriptorCluster(reinterpret_cast<ParticleDescriptorCluster &>(*(rhs.c1))); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9 = new C9(*(rhs.c9)); gaps = new double[c1->nsph](); for (int gi = 0; gi < c1->nsph; gi++) gaps[gi] = rhs.gaps[gi]; tqev = new double[3](); Loading Loading @@ -400,7 +311,6 @@ ClusterIterationData::ClusterIterationData(const mixMPI *mpidata, const int devi c1 = new ParticleDescriptorCluster(mpidata); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9 = new C9(mpidata); gaps = new double[c1->nsph](); MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); tqev = new double[3](); Loading Loading @@ -534,7 +444,6 @@ void ClusterIterationData::mpibcast(const mixMPI *mpidata) { c1->mpibcast(mpidata); const int ndi = c1->nsph * c1->nlim; const np_int ndit = 2 * ndi; c9->mpibcast(mpidata); MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(tqev, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(tqsv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); Loading Loading @@ -615,7 +524,6 @@ ClusterIterationData::~ClusterIterationData() { } delete[] zpv; delete c1; delete c9; delete[] gaps; for (int ti = 1; ti > -1; ti--) { delete[] tqse[ti]; Loading Loading @@ -690,6 +598,9 @@ ParticleDescriptor::ParticleDescriptor(GeometryConfiguration *gconf, ScattererCo _lmtpo = 0; _lmtpos = 0; _nv3j = 0; _ndi = 0; _ndit = 0; _ndm = 0; _num_configurations = sconf->configurations; _num_layers = (sconf->use_external_sphere) ? 1 : 0; for (int nli = 0; nli < num_configurations; nli++) _num_layers += sconf->get_nshl(nli); Loading Loading @@ -821,6 +732,9 @@ ParticleDescriptor::ParticleDescriptor(const ParticleDescriptor &rhs) { _lmtpo = rhs._lmtpo; _lmtpos = rhs._lmtpos; _nv3j = rhs._nv3j; _ndi = rhs._ndi; _ndit = rhs._ndit; _ndm = rhs._ndm; _num_configurations = rhs._num_configurations; _num_layers = rhs._num_layers; Loading Loading @@ -904,11 +818,15 @@ ParticleDescriptor::ParticleDescriptor(const ParticleDescriptor &rhs) { // >>> NEEDED BY CLUSTER <<< vintt = NULL; tfsas = 0.0 + I * 0.0; vec_tsas = NULL; tsas = NULL; gcs = 0.0; scs = 0.0; ecs = 0.0; acs = 0.0; vec_gis = NULL; vec_gls = NULL; vec_sam = NULL; // >>> NEEDED BY CLUSTER AND INCLU <<< vec_am0m = NULL; vec_fsac = NULL; Loading Loading @@ -964,6 +882,9 @@ ParticleDescriptor::ParticleDescriptor(const mixMPI *mpidata) { MPI_Bcast(&_lmtpo, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_lmtpos, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_nv3j, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndi, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndit, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndm, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_configurations, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_layers, 1, MPI_INT, 0, MPI_COMM_WORLD); Loading Loading @@ -1039,6 +960,9 @@ void ParticleDescriptor::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&_lmtpo, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_lmtpos, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_nv3j, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndi, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndit, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_ndm, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_configurations, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&_num_layers, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(vec_rmi, _nsph * _li, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); Loading Loading @@ -1085,6 +1009,9 @@ void ParticleDescriptor::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&scs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&ecs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&acs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(vec_gis, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(vec_gls, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); MPI_Bcast(vec_sam, _ndit * _nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); } // >>> NEEDED BY CLUSTER AND INCLU <<< // if (_class_type == CLUSTER_TYPE || _class_type == INCLUSION_TYPE) { Loading Loading @@ -1211,6 +1138,12 @@ ParticleDescriptor::~ParticleDescriptor() { delete[] vintt; delete[] vec_tsas; delete[] tsas; delete[] vec_gis; delete[] vec_gls; delete[] vec_sam; delete[] gis; delete[] gls; delete[] sam; } } Loading Loading @@ -1260,6 +1193,8 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(GeometryConfiguration *gcon _lmtpo = _li + _le + 1; _lmtpos = _lmtpo * _lmtpo; _nv3j = (_lm * (_lm + 1) * (2 * _lm + 7)) / 6; _ndi = _nsph * _nlim; _ndit = 2 * _nsph * _nlim; vec_am0m = new dcomplex[_nlemt * _nlemt](); vec_fsac = new dcomplex[4](); vec_sac = new dcomplex[4](); Loading Loading @@ -1293,6 +1228,19 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(GeometryConfiguration *gcon ind3j = new int*[_lm + 1]; for (int ii = 0; ii <= _lm; ii++) ind3j[ii] = vec_ind3j + (_lm * ii); rac3j = new double[_lmtpo](); // Needed by CLUSTER vec_gis = new dcomplex[_ndi * _nlem](); gis = new dcomplex*[_ndi]; vec_gls = new dcomplex[_ndi * _nlem](); gls = new dcomplex*[_ndi]; for (int gi = 0; gi < _ndi; gi++) { gis[gi] = vec_gis + (gi * _nlem); gls[gi] = vec_gls + (gi * _nlem); } vec_sam = new dcomplex[_ndit * _nlemt](); sam = new dcomplex*[_ndit]; for (int si = 0; si < _ndit; si++) sam[si] = vec_sam + (si * _nlemt); } ParticleDescriptorCluster::ParticleDescriptorCluster(const ParticleDescriptorCluster &rhs) : ParticleDescriptor(rhs) { Loading Loading @@ -1401,6 +1349,24 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(const ParticleDescriptorClu for (int ii = 0; ii <= _lm; ii++) ind3j[ii] = vec_ind3j + (_lm * ii); rac3j = new double[_lmtpo]; for (int ri = 0; ri < _lmtpo; ri++) rac3j[ri] = rhs.rac3j[ri]; // Needed by CLUSTER vec_gis = new dcomplex[_ndi * _nlem]; vec_gls = new dcomplex[_ndi * _nlem]; for (int vi = 0; vi < _ndi * _nlem; vi++) { vec_gis[vi] = rhs.vec_gis[vi]; vec_gls[vi] = rhs.vec_gls[vi]; } gis = new dcomplex*[_ndi]; gls = new dcomplex*[_ndi]; for (int gi = 0; gi < _ndi; gi++) { gis[gi] = vec_gis + (gi * _nlem); gls[gi] = vec_gls + (gi * _nlem); } vec_sam = new dcomplex[_ndit * _nlemt]; for (int vi = 0; vi < _ndit * _nlemt; vi++) vec_sam[vi] = rhs.vec_sam[vi]; sam = new dcomplex*[_ndit]; for (int si = 0; si < _ndit; si++) sam[si] = vec_sam + (si * _nlemt); } #ifdef MPI_VERSION Loading Loading @@ -1448,6 +1414,22 @@ ParticleDescriptorCluster::ParticleDescriptorCluster(const mixMPI *mpidata) : Pa MPI_Bcast(&scs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&ecs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(&acs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); vec_gis = new dcomplex[_ndi * _nlem]; MPI_Bcast(vec_gis, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); vec_gls = new dcomplex[_ndi * _nlem]; MPI_Bcast(vec_gls, _ndi * _nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); vec_sam = new dcomplex[_ndit * _nlemt]; MPI_Bcast(vec_sam, _ndit * _nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); gis = new dcomplex*[_ndi]; gls = new dcomplex*[_ndi]; for (int gi = 0; gi < _ndi; gi++) { gis[gi] = vec_gis + (gi * _nlem); gls[gi] = vec_gls + (gi * _nlem); } sam = new dcomplex*[_ndit]; for (int si = 0; si < _ndit; si++) { sam[si] = vec_sam + (si * _nlemt); } // >>> NEEDED BY CLUSTER AND INCLU <<< vec_am0m = new dcomplex[_nlemt * _nlemt]; int nlemts = _nlemt * _nlemt; Loading
src/libnptm/clu_subs.cpp +24 −29 Original line number Diff line number Diff line Loading @@ -2533,11 +2533,9 @@ void tqr( tsk = u[0] * tqsv[0] + u[1] * tqsv[1] + u[2] * tqsv[2]; } void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { void ztm(dcomplex **am, ParticleDescriptor *c1) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; const np_int ndi = c1->nsph * c1->nlim; np_int ndit = 2 * ndi; // int i2 = 0; // old implementation #ifdef USE_NVTX nvtxRangePush("ZTM starts"); Loading @@ -2546,8 +2544,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { nvtxRangePush("ZTM parallel loop 1"); #endif // C9 *c9_para = new C9(*c9); dcomplex *gis_v = c9->gis[0]; dcomplex *gls_v = c9->gls[0]; dcomplex *gis_v = c1->gis[0]; dcomplex *gls_v = c1->gls[0]; double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double)); int k2max = c1->li*(c1->li+2); int k3max = c1->le*(c1->le+2); Loading Loading @@ -2591,7 +2589,7 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { int m2 = -l2 - 1 + im2; int i3 = l3 * l3 + im3 - 1; int m3 = -l3 - 1 + im3; int vecindex = (i2 - 1)*c9->nlem + i3 - 1; int vecindex = (i2 - 1) * c1->nlem + i3 - 1; gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local); gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local); } // close k3 loop, former l3 + im3 loops Loading @@ -2605,26 +2603,26 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { nvtxRangePush("ZTM loop 2"); #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c9->sam[0]; dcomplex *sam_v = c1->sam[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) #endif for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable? for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c1->nlem; i3++) { dcomplex sum1 = cc0; dcomplex sum2 = cc0; dcomplex sum3 = cc0; dcomplex sum4 = cc0; int i1e = i1 + ndi; int i1e = i1 + c1->ndi; int i3e = i3 + c1->nlem; #pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4) for (int i2 = 1; i2 <= ndi; i2++) { int i2e = i2 + ndi; int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1; for (int i2 = 1; i2 <= c1->ndi; i2++) { int i2e = i2 + c1->ndi; int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1; dcomplex gie = gis_v[vecindg_23]; dcomplex gle = gls_v[vecindg_23]; np_int vecinda_1 = (i1 - 1)*ndit; np_int vecinda_1e = (i1 - 1 + ndi)*ndit; np_int vecinda_1 = (i1 - 1) * c1->ndit; np_int vecinda_1e = (i1 - 1 + c1->ndi) * c1->ndit; dcomplex a1 = am_v[vecinda_1 + i2 - 1]; dcomplex a2 = am_v[vecinda_1 + i2e - 1]; dcomplex a3 = am_v[vecinda_1e + i2 - 1]; Loading @@ -2634,8 +2632,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { sum3 += (a3 * gie + a4 * gle); sum4 += (a3 * gle + a4 * gie); } // i2 loop int vecind1 = (i1 - 1)*c9->nlemt; int vecind1e = (i1e - 1)*c9->nlemt; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; sam_v[vecind1 + i3 - 1] = sum1; sam_v[vecind1 + i3e - 1] = sum2; sam_v[vecind1e + i3 - 1] = sum3; Loading @@ -2643,39 +2641,36 @@ void ztm(dcomplex **am, ParticleDescriptor *c1, C9 * c9) { } // i3 loop } // i1 loop #pragma omp parallel for collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { for (int i1 = 1; i1 <= c1->ndi; i1++) { for (int i0 = 1; i0 <= c1->nlem; i0++) { int vecindex = (i1 - 1)*c9->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gis_v[vecindex] = dconjg(gis_v[vecindex]); gls_v[vecindex] = dconjg(gls_v[vecindex]); // c9->gis[i1 - 1][i0 - 1] = dconjg(c9->gis[i1 - 1][i0 - 1]); // c9->gls[i1 - 1][i0 - 1] = dconjg(c9->gls[i1 - 1][i0 - 1]); } // i0 loop } // i1 loop int nlemt = c1->nlem + c1->nlem; dcomplex *vec_am0m = c1->am0m[0]; #ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(2) #endif for (int i0 = 1; i0 <= c1->nlem; i0++) { for (int i3 = 1; i3 <= nlemt; i3++) { for (int i3 = 1; i3 <= c1->nlemt; i3++) { int i0e = i0 + c1->nlem; dcomplex sum1 = cc0; dcomplex sum2 = cc0; for (int i1 = 1; i1 <= ndi; i1 ++) { int i1e = i1 + ndi; int vecind1 = (i1 - 1)*c9->nlemt; int vecind1e = (i1e - 1)*c9->nlemt; for (int i1 = 1; i1 <= c1->ndi; i1 ++) { int i1e = i1 + c1->ndi; int vecind1 = (i1 - 1) * c1->nlemt; int vecind1e = (i1e - 1) * c1->nlemt; a1 = sam_v[vecind1 + i3 - 1]; a2 = sam_v[vecind1e + i3 - 1]; int vecindex = (i1 - 1)*c9->nlem + i0 - 1; int vecindex = (i1 - 1) * c1->nlem + i0 - 1; gie = gis_v[vecindex]; gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); } // i1 loop int vecind0 = (i0 - 1)*nlemt; int vecind0e = (i0e - 1)*nlemt; int vecind0 = (i0 - 1) * c1->nlemt; int vecind0e = (i0e - 1) * c1->nlemt; vec_am0m[vecind0 + i3 - 1] = -sum1; vec_am0m[vecind0e + i3 - 1] = -sum2; // c1->am0m[i0 - 1][i3 - 1] = -sum1; Loading