Loading src/cluster/cluster.cpp +7 −3 Original line number Diff line number Diff line Loading @@ -919,11 +919,15 @@ int cluster_jxi488_cycle( outam0->write_to_disk(outam0_name); delete outam0; #endif // DEBUG_AM #ifdef USE_OFFLOAD #ifdef USE_TARGET_OFFLOAD if (rs.use_offload) { cms_gpu(cid->am, cid->c1); } else { cms(cid->am, cid->c1); } #else cms(cid->am, cid->c1); #endif // USE_OFFLOAD #endif // USE_TARGET_OFFLOAD #ifdef DEBUG_AM VirtualAsciiFile *outam1 = new VirtualAsciiFile(); string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt"; Loading src/include/Configuration.h +4 −0 Original line number Diff line number Diff line Loading @@ -345,6 +345,8 @@ protected: int _max_ref_iters; //! \brief Refinement flag. bool _use_refinement; //! \brief Offload flag. bool _use_offload; public: //! \brief Flag for LU factorization inversion. Loading @@ -368,6 +370,8 @@ public: const int& max_ref_iters = _max_ref_iters; //! \brief Read-only view on refinement flag. const bool& use_refinement = _use_refinement; //! \brief Read-only view on offload flag. const bool& use_offload = _use_offload; //! \brief Pointer to a Logger instance. const Logger *logger; Loading src/include/clu_subs.h +3 −3 Original line number Diff line number Diff line Loading @@ -93,7 +93,6 @@ dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int */ double cgev(int ipamo, int mu, int l, int m); #ifndef USE_OFFLOAD /*! \brief Build the multi-centered M-matrix of the cluster. * * This function constructs the multi-centered M-matrix of the cluster, according Loading @@ -103,7 +102,8 @@ double cgev(int ipamo, int mu, int l, int m); * \param c1: `ParticleDescriptor *` */ void cms(dcomplex **am, ParticleDescriptor *c1); #else #ifdef USE_TARGET_OFFLOAD /*! \brief Build the multi-centered M-matrix of the cluster. * * This function constructs the multi-centered M-matrix of the cluster, according Loading @@ -113,7 +113,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1); * \param c1: `ParticleDescriptor *` */ void cms_gpu(dcomplex **am, ParticleDescriptor *c1); #endif // USE_OFFLOAD #endif // USE_TARGET_OFFLOAD /*! \brief Compute orientation-averaged scattered field intensity. * Loading src/libnptm/Configuration.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -461,16 +461,18 @@ RuntimeSettings::RuntimeSettings() { _host_ram_gb = 0.0; _max_ref_iters = 5; _use_refinement = false; _use_offload = true; logger = NULL; } RuntimeSettings::RuntimeSettings(GeometryConfiguration *gconf, Logger *ptr_logger) { _invert_mode = gconf->invert_mode; _accuracy_goal = gconf->accuracy_goal; // TODO: make this option configurable. _accuracy_goal = gconf->accuracy_goal; _gpu_ram_gb = gconf->gpu_ram_gb; _host_ram_gb = gconf->host_ram_gb; _max_ref_iters = gconf->ref_iters; // TODO: make this option configurable. _max_ref_iters = gconf->ref_iters; _use_refinement = gconf->refine_flag; _use_offload = true; // TODO: make this option configurable. logger = ptr_logger; } // >>> END OF RuntimeSettings CLASS IMPLEMENTATION <<< Loading src/libnptm/clu_subs.cpp +134 −87 Original line number Diff line number Diff line Loading @@ -436,20 +436,22 @@ double cgev(int ipamo, int mu, int l, int m) { // #pragma omp end declare target // #endif #ifndef USE_OFFLOAD void cms(dcomplex **am, ParticleDescriptor *c1) { const dcomplex cc0 = 0.0 + 0.0 * I; int ndi = c1->nsph * c1->nlim; const int nsph = c1->nsph; const int nlim = c1->nlim; const int li = c1->li; const int ndi = nsph * nlim; const int nsphmo = nsph - 1; // int nbl = 0; int nsphmo = c1->nsph - 1; for (int n1 = 1; n1 <= nsphmo; n1++) { // GPU portable? int in1 = (n1 - 1) * c1->nlim; for (int n1 = 1; n1 <= nsphmo; n1++) { int in1 = (n1 - 1) * nlim; int n1po = n1 + 1; for (int n2 = n1po; n2 <= c1->nsph; n2++) { int in2 = (n2 - 1) * c1->nlim; for (int n2 = n1po; n2 <= nsph; n2++) { int in2 = (n2 - 1) * nlim; // nbl++; int nbl = ((n1 - 1) * (2 * c1->nsph - n1)) / 2 + n2 - n1; for (int l1 = 1; l1 <= c1->li; l1++) { int nbl = ((n1 - 1) * (2 * nsph - n1)) / 2 + n2 - n1; for (int l1 = 1; l1 <= li; l1++) { int l1po = l1 + 1; int il1 = l1po * l1; int l1tpo = l1po + l1; Loading @@ -461,12 +463,12 @@ void cms(dcomplex **am, ParticleDescriptor *c1) { int i1e = in1 + ilm1e; int j1 = in2 + ilm1; int j1e = in2 + ilm1e; for (int l2 = 1; l2 <= c1->li; l2++) { for (int l2 = 1; l2 <= li; l2++) { int l2po = l2 + 1; int il2 = l2po * l2; int l2tpo = l2po + l2; int ish = ((l2 + l1) % 2 == 0) ? 1 : -1; int isk = -ish; double rsh = ((l2 + l1) % 2 == 0) ? 1.0 : -1.0; double rsk = -rsh; for (int im2 = 1; im2 <= l2tpo; im2++) { int m2 = im2 - l2po; int ilm2 = il2 + m2; Loading @@ -481,20 +483,20 @@ void cms(dcomplex **am, ParticleDescriptor *c1) { am[i1 - 1][i2e - 1] = cgk; am[i1e - 1][i2 - 1] = cgk; am[i1e - 1][i2e - 1] = cgh; am[j1 - 1][j2 - 1] = cgh * (1.0 * ish); am[j1 - 1][j2e - 1] = cgk * (1.0 * isk); am[j1e - 1][j2 - 1] = cgk * (1.0 * isk); am[j1e - 1][j2e - 1] = cgh * (1.0 * ish); am[j1 - 1][j2 - 1] = cgh * rsh; am[j1 - 1][j2e - 1] = cgk * rsk; am[j1e - 1][j2 - 1] = cgk * rsk; am[j1e - 1][j2e - 1] = cgh * rsh; } // im2 loop } // l2 loop } // im1 loop } // l1 loop } // n2 loop } // n1 loop #pragma omp parallel for for (int n1 = 1; n1 <= c1->nsph; n1++) { // GPU portable? int in1 = (n1 - 1) * c1->nlim; for (int l1 = 1; l1 <= c1->li; l1++) { #pragma omp parallel for collapse(2) for (int n1 = 1; n1 <= nsph; n1++) { // GPU portable? for (int l1 = 1; l1 <= li; l1++) { int in1 = (n1 - 1) * nlim; dcomplex dm = c1->rmi[l1 - 1][n1 - 1]; dcomplex de = c1->rei[l1 - 1][n1 - 1]; int l1po = l1 + 1; Loading @@ -520,70 +522,111 @@ void cms(dcomplex **am, ParticleDescriptor *c1) { } // l1 loop } // n1 loop } #else #ifdef USE_TARGET_OFFLOAD void cms_gpu(dcomplex **am, ParticleDescriptor *c1) { const dcomplex cc0 = 0.0 + 0.0 * I; int ndi = c1->nsph * c1->nlim; // int nbl = 0; int nsphmo = c1->nsph - 1; for (int n1 = 1; n1 <= nsphmo; n1++) { // GPU portable? int in1 = (n1 - 1) * c1->nlim; int n1po = n1 + 1; for (int n2 = n1po; n2 <= c1->nsph; n2++) { int in2 = (n2 - 1) * c1->nlim; // nbl++; int nbl = ((n1 - 1) * (2 * c1->nsph - n1)) / 2 + n2 - n1; for (int l1 = 1; l1 <= c1->li; l1++) { const int nsph = c1->nsph; const int nlim = c1->nlim; const int li = c1->li; const int ndi = nsph * nlim; const int max_litpo = 2 * li + 1; const int nsphmo = nsph - 1; const np_int num_pairs = (nsph * (nsph - 1)) / 2; const np_int total_iters = num_pairs * li * max_litpo * li * max_litpo; const dcomplex cc0 = 0.0 + I * 0.0; // Prepare and fill look-up tables int *lut_n1 = new int[num_pairs]; int *lut_n2 = new int[num_pairs]; for (int n1 = 1; n1 <= nsphmo; n1++) { for (int n2 = n1 + 1; n2 <= nsph; n2++) { int nbl = ((n1 - 1) * (2 * nsph - n1)) / 2 + n2 - n1; lut_n1[nbl - 1] = n1; lut_n2[nbl - 1] = n2; } } #pragma omp parallel for for (np_int iter = 0; iter < total_iters; ++iter) { np_int t = iter; // Index calculation (from innermost to outermost) int im2 = (t % max_litpo) + 1; t /= max_litpo; int l2 = (t % li) + 1; t /= li; int im1 = (t % max_litpo) + 1; t /= max_litpo; int l1 = (t % li) + 1; t /= li; int nbl_idx = (t % num_pairs); // 0-based for look-up tables // Look-up values int n1 = lut_n1[nbl_idx]; int n2 = lut_n2[nbl_idx]; int nbl = nbl_idx + 1; // Ranges of magnetic quantum numbers int l1po = l1 + 1; int il1 = l1po * l1; int l1tpo = l1po + l1; for (int im1 = 1; im1 <= l1tpo; im1++) { int m1 = im1 - l1po; int ilm1 = il1 + m1; int ilm1e = ilm1 + ndi; int i1 = in1 + ilm1; int i1e = in1 + ilm1e; int j1 = in2 + ilm1; int j1e = in2 + ilm1e; for (int l2 = 1; l2 <= c1->li; l2++) { int l2po = l2 + 1; int l1tpo = 2 * l1 + 1; int l2tpo = 2 * l2 + 1; int il1 = l1po * l1; int in1 = (n1 - 1) * nlim; int in2 = (n2 - 1) * nlim; int il2 = l2po * l2; int l2tpo = l2po + l2; int ish = ((l2 + l1) % 2 == 0) ? 1 : -1; int isk = -ish; for (int im2 = 1; im2 <= l2tpo; im2++) { int m2 = im2 - l2po; int ilm2 = il2 + m2; int ilm2e = ilm2 + ndi; int i2 = in2 + ilm2; int i2e = in2 + ilm2e; int j2 = in1 + ilm2; int j2e = in1 + ilm2e; dcomplex cgh = ghit(0, 0, nbl, l1, m1, l2, m2, c1); dcomplex cgk = ghit(0, 1, nbl, l1, m1, l2, m2, c1); double rsh = ((l2 + l1) % 2 == 0) ? 1.0 : -1.0; double rsk = -rsh; bool is_valid_iter = (im1 <= l1tpo && im2 <= l2tpo); // Index 1 magnetic quantum numbers int m1 = (is_valid_iter) ? im1 - l1po : 0; int ilm1 = (is_valid_iter) ? il1 + m1 : 0; int ilm1e = (is_valid_iter) ? ilm1 + ndi : 0; int i1 = (is_valid_iter) ? in1 + ilm1 : 0; int i1e = (is_valid_iter) ? in1 + ilm1e : 0; int j1 = (is_valid_iter) ? in2 + ilm1 : 0; int j1e = (is_valid_iter) ? in2 + ilm1e : 0; // End of index 1 magnetic quantum numbers // Index 2 magnetic quantum numbers int m2 = (is_valid_iter) ? im2 - l2po : 0; int ilm2 = (is_valid_iter) ? il2 + m2 : 0; int ilm2e = (is_valid_iter) ? ilm2 + ndi : 0; int i2 = (is_valid_iter) ? in2 + ilm2 : 0; int i2e = (is_valid_iter) ? in2 + ilm2e : 0; int j2 = (is_valid_iter) ? in1 + ilm2 : 0; int j2e = (is_valid_iter) ? in1 + ilm2e : 0; // End of index 2 magnetic quantum numbers dcomplex cgh, cgk; #pragma omp critical { cgh = (is_valid_iter) ? ghit(0, 0, nbl, l1, m1, l2, m2, c1) : cc0; cgk = (is_valid_iter) ? ghit(0, 1, nbl, l1, m1, l2, m2, c1) : cc0; } if (is_valid_iter) { am[i1 - 1][i2 - 1] = cgh; am[i1 - 1][i2e - 1] = cgk; am[i1e - 1][i2 - 1] = cgk; am[i1e - 1][i2e - 1] = cgh; am[j1 - 1][j2 - 1] = cgh * (1.0 * ish); am[j1 - 1][j2e - 1] = cgk * (1.0 * isk); am[j1e - 1][j2 - 1] = cgk * (1.0 * isk); am[j1e - 1][j2e - 1] = cgh * (1.0 * ish); } // im2 loop } // l2 loop } // im1 loop } // l1 loop } // n2 loop } // n1 loop am[j1 - 1][j2 - 1] = cgh * rsh; am[j1 - 1][j2e - 1] = cgk * rsk; am[j1e - 1][j2 - 1] = cgk * rsk; am[j1e - 1][j2e - 1] = cgh * rsh; } } delete[] lut_n1; delete[] lut_n2; int max_l1tpo = 2 * c1->li + 1; np_int diag_iters = c1->li * max_l1tpo * c1->nsph; np_int diag_iters = li * max_litpo * nsph; #pragma omp parallel for collapse(3) for (int n1 = 1; n1 <= c1->nsph; n1++) { for (int l1 = 1; l1 <= c1->li; l1++) { // GPU portable for (int im1 = 1; im1 <= max_l1tpo; im1++) { dcomplex scratch_e, scratch_m; int in1 = (n1 - 1) * c1->nlim; for (int n1 = 1; n1 <= nsph; n1++) { for (int l1 = 1; l1 <= li; l1++) { // GPU portable for (int im1 = 1; im1 <= max_litpo; im1++) { dcomplex result_e, result_m; int in1 = (n1 - 1) * nlim; dcomplex dm = c1->rmi[l1 - 1][n1 - 1]; dcomplex de = c1->rei[l1 - 1][n1 - 1]; int l1po = l1 + 1; Loading @@ -593,15 +636,19 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1) { int ilm1 = (im1 <= l1tpo) ? il1 + m1 : 0; int i1 = (im1 <= l1tpo) ? in1 + ilm1 : 0; int i1e = (im1 <= l1tpo) ? i1 + ndi : 0; dcomplex *target_m = (im1 <= l1tpo) ? (am[i1 - 1] + i1 - 1) : &scratch_m; dcomplex *target_e = (im1 <= l1tpo) ? (am[i1e - 1] + i1e - 1) : &scratch_e; *target_m = dm; *target_e = de; // dcomplex *target_m = (im1 <= l1tpo) ? (am[i1 - 1] + i1 - 1) : &scratch_m; // dcomplex *target_e = (im1 <= l1tpo) ? (am[i1e - 1] + i1e - 1) : &scratch_e; result_m = dm; result_e = de; if (im1 <= l1tpo) { am[i1 - 1][i1 -1] = result_m; am[i1e - 1][i1e -1] = result_e; } } // im1 loop } // l1 loop } // n1 loop } #endif // USE_OFFLOAD #endif // USE_TARGET_OFFLOAD void crsm1(double vk, double exri, ParticleDescriptor *c1) { dcomplex ***svf, ***svw, **svs; Loading Loading
src/cluster/cluster.cpp +7 −3 Original line number Diff line number Diff line Loading @@ -919,11 +919,15 @@ int cluster_jxi488_cycle( outam0->write_to_disk(outam0_name); delete outam0; #endif // DEBUG_AM #ifdef USE_OFFLOAD #ifdef USE_TARGET_OFFLOAD if (rs.use_offload) { cms_gpu(cid->am, cid->c1); } else { cms(cid->am, cid->c1); } #else cms(cid->am, cid->c1); #endif // USE_OFFLOAD #endif // USE_TARGET_OFFLOAD #ifdef DEBUG_AM VirtualAsciiFile *outam1 = new VirtualAsciiFile(); string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt"; Loading
src/include/Configuration.h +4 −0 Original line number Diff line number Diff line Loading @@ -345,6 +345,8 @@ protected: int _max_ref_iters; //! \brief Refinement flag. bool _use_refinement; //! \brief Offload flag. bool _use_offload; public: //! \brief Flag for LU factorization inversion. Loading @@ -368,6 +370,8 @@ public: const int& max_ref_iters = _max_ref_iters; //! \brief Read-only view on refinement flag. const bool& use_refinement = _use_refinement; //! \brief Read-only view on offload flag. const bool& use_offload = _use_offload; //! \brief Pointer to a Logger instance. const Logger *logger; Loading
src/include/clu_subs.h +3 −3 Original line number Diff line number Diff line Loading @@ -93,7 +93,6 @@ dcomplex cdtp(dcomplex z, dcomplex *vec_am, int i, int jf, int k, int nj, np_int */ double cgev(int ipamo, int mu, int l, int m); #ifndef USE_OFFLOAD /*! \brief Build the multi-centered M-matrix of the cluster. * * This function constructs the multi-centered M-matrix of the cluster, according Loading @@ -103,7 +102,8 @@ double cgev(int ipamo, int mu, int l, int m); * \param c1: `ParticleDescriptor *` */ void cms(dcomplex **am, ParticleDescriptor *c1); #else #ifdef USE_TARGET_OFFLOAD /*! \brief Build the multi-centered M-matrix of the cluster. * * This function constructs the multi-centered M-matrix of the cluster, according Loading @@ -113,7 +113,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1); * \param c1: `ParticleDescriptor *` */ void cms_gpu(dcomplex **am, ParticleDescriptor *c1); #endif // USE_OFFLOAD #endif // USE_TARGET_OFFLOAD /*! \brief Compute orientation-averaged scattered field intensity. * Loading
src/libnptm/Configuration.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -461,16 +461,18 @@ RuntimeSettings::RuntimeSettings() { _host_ram_gb = 0.0; _max_ref_iters = 5; _use_refinement = false; _use_offload = true; logger = NULL; } RuntimeSettings::RuntimeSettings(GeometryConfiguration *gconf, Logger *ptr_logger) { _invert_mode = gconf->invert_mode; _accuracy_goal = gconf->accuracy_goal; // TODO: make this option configurable. _accuracy_goal = gconf->accuracy_goal; _gpu_ram_gb = gconf->gpu_ram_gb; _host_ram_gb = gconf->host_ram_gb; _max_ref_iters = gconf->ref_iters; // TODO: make this option configurable. _max_ref_iters = gconf->ref_iters; _use_refinement = gconf->refine_flag; _use_offload = true; // TODO: make this option configurable. logger = ptr_logger; } // >>> END OF RuntimeSettings CLASS IMPLEMENTATION <<< Loading
src/libnptm/clu_subs.cpp +134 −87 Original line number Diff line number Diff line Loading @@ -436,20 +436,22 @@ double cgev(int ipamo, int mu, int l, int m) { // #pragma omp end declare target // #endif #ifndef USE_OFFLOAD void cms(dcomplex **am, ParticleDescriptor *c1) { const dcomplex cc0 = 0.0 + 0.0 * I; int ndi = c1->nsph * c1->nlim; const int nsph = c1->nsph; const int nlim = c1->nlim; const int li = c1->li; const int ndi = nsph * nlim; const int nsphmo = nsph - 1; // int nbl = 0; int nsphmo = c1->nsph - 1; for (int n1 = 1; n1 <= nsphmo; n1++) { // GPU portable? int in1 = (n1 - 1) * c1->nlim; for (int n1 = 1; n1 <= nsphmo; n1++) { int in1 = (n1 - 1) * nlim; int n1po = n1 + 1; for (int n2 = n1po; n2 <= c1->nsph; n2++) { int in2 = (n2 - 1) * c1->nlim; for (int n2 = n1po; n2 <= nsph; n2++) { int in2 = (n2 - 1) * nlim; // nbl++; int nbl = ((n1 - 1) * (2 * c1->nsph - n1)) / 2 + n2 - n1; for (int l1 = 1; l1 <= c1->li; l1++) { int nbl = ((n1 - 1) * (2 * nsph - n1)) / 2 + n2 - n1; for (int l1 = 1; l1 <= li; l1++) { int l1po = l1 + 1; int il1 = l1po * l1; int l1tpo = l1po + l1; Loading @@ -461,12 +463,12 @@ void cms(dcomplex **am, ParticleDescriptor *c1) { int i1e = in1 + ilm1e; int j1 = in2 + ilm1; int j1e = in2 + ilm1e; for (int l2 = 1; l2 <= c1->li; l2++) { for (int l2 = 1; l2 <= li; l2++) { int l2po = l2 + 1; int il2 = l2po * l2; int l2tpo = l2po + l2; int ish = ((l2 + l1) % 2 == 0) ? 1 : -1; int isk = -ish; double rsh = ((l2 + l1) % 2 == 0) ? 1.0 : -1.0; double rsk = -rsh; for (int im2 = 1; im2 <= l2tpo; im2++) { int m2 = im2 - l2po; int ilm2 = il2 + m2; Loading @@ -481,20 +483,20 @@ void cms(dcomplex **am, ParticleDescriptor *c1) { am[i1 - 1][i2e - 1] = cgk; am[i1e - 1][i2 - 1] = cgk; am[i1e - 1][i2e - 1] = cgh; am[j1 - 1][j2 - 1] = cgh * (1.0 * ish); am[j1 - 1][j2e - 1] = cgk * (1.0 * isk); am[j1e - 1][j2 - 1] = cgk * (1.0 * isk); am[j1e - 1][j2e - 1] = cgh * (1.0 * ish); am[j1 - 1][j2 - 1] = cgh * rsh; am[j1 - 1][j2e - 1] = cgk * rsk; am[j1e - 1][j2 - 1] = cgk * rsk; am[j1e - 1][j2e - 1] = cgh * rsh; } // im2 loop } // l2 loop } // im1 loop } // l1 loop } // n2 loop } // n1 loop #pragma omp parallel for for (int n1 = 1; n1 <= c1->nsph; n1++) { // GPU portable? int in1 = (n1 - 1) * c1->nlim; for (int l1 = 1; l1 <= c1->li; l1++) { #pragma omp parallel for collapse(2) for (int n1 = 1; n1 <= nsph; n1++) { // GPU portable? for (int l1 = 1; l1 <= li; l1++) { int in1 = (n1 - 1) * nlim; dcomplex dm = c1->rmi[l1 - 1][n1 - 1]; dcomplex de = c1->rei[l1 - 1][n1 - 1]; int l1po = l1 + 1; Loading @@ -520,70 +522,111 @@ void cms(dcomplex **am, ParticleDescriptor *c1) { } // l1 loop } // n1 loop } #else #ifdef USE_TARGET_OFFLOAD void cms_gpu(dcomplex **am, ParticleDescriptor *c1) { const dcomplex cc0 = 0.0 + 0.0 * I; int ndi = c1->nsph * c1->nlim; // int nbl = 0; int nsphmo = c1->nsph - 1; for (int n1 = 1; n1 <= nsphmo; n1++) { // GPU portable? int in1 = (n1 - 1) * c1->nlim; int n1po = n1 + 1; for (int n2 = n1po; n2 <= c1->nsph; n2++) { int in2 = (n2 - 1) * c1->nlim; // nbl++; int nbl = ((n1 - 1) * (2 * c1->nsph - n1)) / 2 + n2 - n1; for (int l1 = 1; l1 <= c1->li; l1++) { const int nsph = c1->nsph; const int nlim = c1->nlim; const int li = c1->li; const int ndi = nsph * nlim; const int max_litpo = 2 * li + 1; const int nsphmo = nsph - 1; const np_int num_pairs = (nsph * (nsph - 1)) / 2; const np_int total_iters = num_pairs * li * max_litpo * li * max_litpo; const dcomplex cc0 = 0.0 + I * 0.0; // Prepare and fill look-up tables int *lut_n1 = new int[num_pairs]; int *lut_n2 = new int[num_pairs]; for (int n1 = 1; n1 <= nsphmo; n1++) { for (int n2 = n1 + 1; n2 <= nsph; n2++) { int nbl = ((n1 - 1) * (2 * nsph - n1)) / 2 + n2 - n1; lut_n1[nbl - 1] = n1; lut_n2[nbl - 1] = n2; } } #pragma omp parallel for for (np_int iter = 0; iter < total_iters; ++iter) { np_int t = iter; // Index calculation (from innermost to outermost) int im2 = (t % max_litpo) + 1; t /= max_litpo; int l2 = (t % li) + 1; t /= li; int im1 = (t % max_litpo) + 1; t /= max_litpo; int l1 = (t % li) + 1; t /= li; int nbl_idx = (t % num_pairs); // 0-based for look-up tables // Look-up values int n1 = lut_n1[nbl_idx]; int n2 = lut_n2[nbl_idx]; int nbl = nbl_idx + 1; // Ranges of magnetic quantum numbers int l1po = l1 + 1; int il1 = l1po * l1; int l1tpo = l1po + l1; for (int im1 = 1; im1 <= l1tpo; im1++) { int m1 = im1 - l1po; int ilm1 = il1 + m1; int ilm1e = ilm1 + ndi; int i1 = in1 + ilm1; int i1e = in1 + ilm1e; int j1 = in2 + ilm1; int j1e = in2 + ilm1e; for (int l2 = 1; l2 <= c1->li; l2++) { int l2po = l2 + 1; int l1tpo = 2 * l1 + 1; int l2tpo = 2 * l2 + 1; int il1 = l1po * l1; int in1 = (n1 - 1) * nlim; int in2 = (n2 - 1) * nlim; int il2 = l2po * l2; int l2tpo = l2po + l2; int ish = ((l2 + l1) % 2 == 0) ? 1 : -1; int isk = -ish; for (int im2 = 1; im2 <= l2tpo; im2++) { int m2 = im2 - l2po; int ilm2 = il2 + m2; int ilm2e = ilm2 + ndi; int i2 = in2 + ilm2; int i2e = in2 + ilm2e; int j2 = in1 + ilm2; int j2e = in1 + ilm2e; dcomplex cgh = ghit(0, 0, nbl, l1, m1, l2, m2, c1); dcomplex cgk = ghit(0, 1, nbl, l1, m1, l2, m2, c1); double rsh = ((l2 + l1) % 2 == 0) ? 1.0 : -1.0; double rsk = -rsh; bool is_valid_iter = (im1 <= l1tpo && im2 <= l2tpo); // Index 1 magnetic quantum numbers int m1 = (is_valid_iter) ? im1 - l1po : 0; int ilm1 = (is_valid_iter) ? il1 + m1 : 0; int ilm1e = (is_valid_iter) ? ilm1 + ndi : 0; int i1 = (is_valid_iter) ? in1 + ilm1 : 0; int i1e = (is_valid_iter) ? in1 + ilm1e : 0; int j1 = (is_valid_iter) ? in2 + ilm1 : 0; int j1e = (is_valid_iter) ? in2 + ilm1e : 0; // End of index 1 magnetic quantum numbers // Index 2 magnetic quantum numbers int m2 = (is_valid_iter) ? im2 - l2po : 0; int ilm2 = (is_valid_iter) ? il2 + m2 : 0; int ilm2e = (is_valid_iter) ? ilm2 + ndi : 0; int i2 = (is_valid_iter) ? in2 + ilm2 : 0; int i2e = (is_valid_iter) ? in2 + ilm2e : 0; int j2 = (is_valid_iter) ? in1 + ilm2 : 0; int j2e = (is_valid_iter) ? in1 + ilm2e : 0; // End of index 2 magnetic quantum numbers dcomplex cgh, cgk; #pragma omp critical { cgh = (is_valid_iter) ? ghit(0, 0, nbl, l1, m1, l2, m2, c1) : cc0; cgk = (is_valid_iter) ? ghit(0, 1, nbl, l1, m1, l2, m2, c1) : cc0; } if (is_valid_iter) { am[i1 - 1][i2 - 1] = cgh; am[i1 - 1][i2e - 1] = cgk; am[i1e - 1][i2 - 1] = cgk; am[i1e - 1][i2e - 1] = cgh; am[j1 - 1][j2 - 1] = cgh * (1.0 * ish); am[j1 - 1][j2e - 1] = cgk * (1.0 * isk); am[j1e - 1][j2 - 1] = cgk * (1.0 * isk); am[j1e - 1][j2e - 1] = cgh * (1.0 * ish); } // im2 loop } // l2 loop } // im1 loop } // l1 loop } // n2 loop } // n1 loop am[j1 - 1][j2 - 1] = cgh * rsh; am[j1 - 1][j2e - 1] = cgk * rsk; am[j1e - 1][j2 - 1] = cgk * rsk; am[j1e - 1][j2e - 1] = cgh * rsh; } } delete[] lut_n1; delete[] lut_n2; int max_l1tpo = 2 * c1->li + 1; np_int diag_iters = c1->li * max_l1tpo * c1->nsph; np_int diag_iters = li * max_litpo * nsph; #pragma omp parallel for collapse(3) for (int n1 = 1; n1 <= c1->nsph; n1++) { for (int l1 = 1; l1 <= c1->li; l1++) { // GPU portable for (int im1 = 1; im1 <= max_l1tpo; im1++) { dcomplex scratch_e, scratch_m; int in1 = (n1 - 1) * c1->nlim; for (int n1 = 1; n1 <= nsph; n1++) { for (int l1 = 1; l1 <= li; l1++) { // GPU portable for (int im1 = 1; im1 <= max_litpo; im1++) { dcomplex result_e, result_m; int in1 = (n1 - 1) * nlim; dcomplex dm = c1->rmi[l1 - 1][n1 - 1]; dcomplex de = c1->rei[l1 - 1][n1 - 1]; int l1po = l1 + 1; Loading @@ -593,15 +636,19 @@ void cms_gpu(dcomplex **am, ParticleDescriptor *c1) { int ilm1 = (im1 <= l1tpo) ? il1 + m1 : 0; int i1 = (im1 <= l1tpo) ? in1 + ilm1 : 0; int i1e = (im1 <= l1tpo) ? i1 + ndi : 0; dcomplex *target_m = (im1 <= l1tpo) ? (am[i1 - 1] + i1 - 1) : &scratch_m; dcomplex *target_e = (im1 <= l1tpo) ? (am[i1e - 1] + i1e - 1) : &scratch_e; *target_m = dm; *target_e = de; // dcomplex *target_m = (im1 <= l1tpo) ? (am[i1 - 1] + i1 - 1) : &scratch_m; // dcomplex *target_e = (im1 <= l1tpo) ? (am[i1e - 1] + i1e - 1) : &scratch_e; result_m = dm; result_e = de; if (im1 <= l1tpo) { am[i1 - 1][i1 -1] = result_m; am[i1e - 1][i1e -1] = result_e; } } // im1 loop } // l1 loop } // n1 loop } #endif // USE_OFFLOAD #endif // USE_TARGET_OFFLOAD void crsm1(double vk, double exri, ParticleDescriptor *c1) { dcomplex ***svf, ***svw, **svs; Loading