Commit 03adc9d6 authored by Giovanni La Mura's avatar Giovanni La Mura
Browse files

Merge branch 'master' into script_devel updating to GPU optimized trapping

parents b33deb22 c1fe34a6
Loading
Loading
Loading
Loading
+6 −6
Original line number Original line Diff line number Diff line
@@ -364,8 +364,8 @@ result=$?
if [ "x$result" = "x0" ]; then
if [ "x$result" = "x0" ]; then
    CLANGFLAGS=" -stdlib=libstdc++"
    CLANGFLAGS=" -stdlib=libstdc++"
fi
fi
echo -n "configure: checking wether $CXX works... "
echo -n "configure: checking whether $CXX works... "
echo -n "configure: checking wether $CXX works... " >>configure.log
echo -n "configure: checking whether $CXX works... " >>configure.log
cat > test_compiler.cpp <<EOF
cat > test_compiler.cpp <<EOF
int main() {
int main() {
  int i = -1;
  int i = -1;
@@ -385,8 +385,8 @@ else
    echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log
    echo "ERROR: $CXX is not a working C++ compiler!" >>configure.log
    exit 2
    exit 2
fi
fi
echo -n "configure: checking wether $CXX supports -ggdb... "
echo -n "configure: checking whether $CXX supports -ggdb... "
echo -n "configure: checking wether $CXX supports -ggdb... " >>configure.log
echo -n "configure: checking whether $CXX supports -ggdb... " >>configure.log
$CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log
$CXX $CLANGFLAGS -ggdb test_compiler.cpp -o test_compiler > /dev/null 2>>error.log
result=$?
result=$?
if [ "x$result" = "x0" ]; then
if [ "x$result" = "x0" ]; then
@@ -410,8 +410,8 @@ else
    echo "no"
    echo "no"
    echo "no" >>configure.log
    echo "no" >>configure.log
fi
fi
echo -n "configure: checking wether $CXX is a MPI compiler... "
echo -n "configure: checking whether $CXX is a MPI compiler... "
echo -n "configure: checking wether $CXX is a MPI compiler... " >>configure.log
echo -n "configure: checking whether $CXX is a MPI compiler... " >>configure.log
cat > test_compiler.cpp <<EOF
cat > test_compiler.cpp <<EOF
# include <mpi.h>
# include <mpi.h>
int main() {
int main() {
+20 −30
Original line number Original line Diff line number Diff line
@@ -28,7 +28,7 @@ class Swap1 {
protected:
protected:
  //! Index of the last element to be filled.
  //! Index of the last element to be filled.
  int _last_index;
  int _last_index;
  //! Number of vector coordinates. QUESTION: correct?
  //! Number of beam description wave-numbers.
  int _nkv;
  int _nkv;
  //! NLMMT = 2 * LM * (LM + 2)
  //! NLMMT = 2 * LM * (LM + 2)
  int _nlmmt;
  int _nlmmt;
@@ -69,7 +69,7 @@ public:
  /*! \brief Swap1 instance constructor.
  /*! \brief Swap1 instance constructor.
   *
   *
   * \param lm: `int` Maximum field expansion order.
   * \param lm: `int` Maximum field expansion order.
   * \param _nkv: `int` Number of vector coordinates. QUESTION: correct?
   * \param nkv: `int` Number of beam description wave numbers.
   */
   */
  Swap1(int lm, int nkv);
  Swap1(int lm, int nkv);


@@ -130,10 +130,8 @@ protected:
  int _last_vector;
  int _last_vector;
  //! Index of the last matrix element to be filled.
  //! Index of the last matrix element to be filled.
  int _last_matrix;
  int _last_matrix;
  //! Number of vector coordinates. QUESTION: correct?
  //! Number of beam description wave numbers.
  int _nkv;
  int _nkv;
  //! Contiguous vector of VKZM matrix.
  double *vec_vkzm;
  //! QUESTION: definition?
  //! QUESTION: definition?
  double _apfafa;
  double _apfafa;
  //! QUESTION: definition?
  //! QUESTION: definition?
@@ -152,13 +150,13 @@ protected:
  double _delxyz;
  double _delxyz;
  //! QUESTION: definition?
  //! QUESTION: definition?
  double _vknmx;
  double _vknmx;
  //! QUESTION: definition?
  //! Wave number grid spacing.
  double _delk;
  double _delk;
  //! QUESTION: definition?
  //! Square of wave number grid spacing.
  double _delks;
  double _delks;
  //! NLMMT = LM * (LM + 2) * 2
  //! NLMMT = LM * (LM + 2) * 2
  int _nlmmt;
  int _nlmmt;
  //! Number of radial vector coordinates. QUESTION: correct?
  //! Number of radial vector coordinates.
  int _nrvc;
  int _nrvc;


  /*! \brief Load a Swap2 instance from a HDF5 binary file.
  /*! \brief Load a Swap2 instance from a HDF5 binary file.
@@ -192,12 +190,12 @@ public:
  const int &last_vector = _last_vector;
  const int &last_vector = _last_vector;
  //! Read-only view on the index of the last matrix element to be filled.
  //! Read-only view on the index of the last matrix element to be filled.
  const int &last_matrix = _last_matrix;
  const int &last_matrix = _last_matrix;
  //! Read-only view on the number of vector coordinates. QUESTION: correct?
  //! Read-only view on the number of beam description wave numbers.
  const int &nkv = _nkv;
  const int &nkv = _nkv;
  //! QUESTION: definition?
  //! QUESTION: definition?
  double *vkv;
  double *vkv;
  //! QUESTION: definition?
  //! QUESTION: definition?
  double **vkzm;
  double *vec_vkzm;
  //! QUESTION: definition?
  //! QUESTION: definition?
  const double &apfafa = _apfafa;
  const double &apfafa = _apfafa;
  //! QUESTION: definition?
  //! QUESTION: definition?
@@ -222,12 +220,12 @@ public:
  const double &delks = _delks;
  const double &delks = _delks;
  //! NLMMT = LM * (LM + 2) * 2
  //! NLMMT = LM * (LM + 2) * 2
  const int &nlmmt = _nlmmt;
  const int &nlmmt = _nlmmt;
  //! Number of radial vector coordinates. QUESTION: correct?
  //! Read-only view on the number of radial vector coordinates.
  const int &nrvc = _nrvc;
  const int &nrvc = _nrvc;


  /*! \brief Swap2 instance constructor.
  /*! \brief Swap2 instance constructor.
   *
   *
   * \param nkv: `int` Number of vector coordinates. QUESTION: correct?
   * \param nkv: `int` Number of beam description wave numbers.
   */
   */
  Swap2(int nkv);
  Swap2(int nkv);


@@ -244,15 +242,9 @@ public:
   */
   */
  static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY");
  static Swap2* from_binary(const std::string& file_name, const std::string& mode="LEGACY");


  /*! \brief Get the pointer to the VKZM matrix.
   *
   * \return value: `double **` Pointer to the VKZM matrix.
   */
  double **get_matrix() { return vkzm; }

  /*! \brief Calculate the necessary amount of memory to create a new instance.
  /*! \brief Calculate the necessary amount of memory to create a new instance.
   *
   *
   * \param nkv: `int` Number of radial vector coordinates. QUESTION: correct?
   * \param nkv: `int` Number of beam description wave numbers.
   * \return size: `long` The necessary memory size in bytes.
   * \return size: `long` The necessary memory size in bytes.
   */
   */
  static long get_size(int nkv);
  static long get_size(int nkv);
@@ -316,11 +308,11 @@ protected:
  int _nlmmt;
  int _nlmmt;
  //! NRVC = NXV * NYV * NZV
  //! NRVC = NXV * NYV * NZV
  int _nrvc;
  int _nrvc;
  //! Field expansion mode identifier.
  //! Beam description mode.
  int _lmode;
  int _lmode;
  //! Maximum field expansion order.
  //! Maximum field expansion order.
  int _lm;
  int _lm;
  //! QUESTION: definition?
  //! Number of beam description wave numbers.
  int _nkv;
  int _nkv;
  //! Number of computed X coordinates.
  //! Number of computed X coordinates.
  int _nxv;
  int _nxv;
@@ -332,11 +324,11 @@ protected:
  double _vk;
  double _vk;
  //! External medium refractive index
  //! External medium refractive index
  double _exri;
  double _exri;
  //! QUESTION: definition?
  //! Numerical aperture.
  double _an;
  double _an;
  //! QUESTION: definition?
  //! Filling factor.
  double _ff;
  double _ff;
  //! QUESTION: definition?
  //! Lens transmission.
  double _tra;
  double _tra;
  //! QUESTION: definition?
  //! QUESTION: definition?
  double _spd;
  double _spd;
@@ -350,8 +342,6 @@ protected:
  double *yv;
  double *yv;
  //! Vector of computed z positions
  //! Vector of computed z positions
  double *zv;
  double *zv;
  //! QUESTION: definition?
  dcomplex *vec_wsum;


  /*! \brief Load a configuration instance from a HDF5 binary file.
  /*! \brief Load a configuration instance from a HDF5 binary file.
   *
   *
@@ -402,11 +392,11 @@ public:
  const double& vk = _vk;
  const double& vk = _vk;
  //! Read-only view on external medium refractive index
  //! Read-only view on external medium refractive index
  const double& exri = _exri;
  const double& exri = _exri;
  //! QUESTION: definition?
  //! Read-only view on numeric aperture.
  const double& an = _an;
  const double& an = _an;
  //! QUESTION: definition?
  //! Read-only view on filling factor.
  const double& ff = _ff;
  const double& ff = _ff;
  //! QUESTION: definition?
  //! Read-only view on lens transmission.
  const double& tra = _tra;
  const double& tra = _tra;
  //! QUESTION: definition?
  //! QUESTION: definition?
  const double& spd = _spd;
  const double& spd = _spd;
@@ -415,7 +405,7 @@ public:
  //! QUESTION: definition?
  //! QUESTION: definition?
  const double& exril = _exril;
  const double& exril = _exril;
  //! QUESTION: definition?
  //! QUESTION: definition?
  dcomplex **wsum;
  dcomplex *vec_wsum;
  
  
  /*! \brief Trapping configuration instance constructor.
  /*! \brief Trapping configuration instance constructor.
   *
   *
+4 −2
Original line number Original line Diff line number Diff line
@@ -340,13 +340,15 @@ GeometryConfiguration* GeometryConfiguration::from_legacy(const std::string& fil
  // Read optional configuration data used only by the C++ code.
  // Read optional configuration data used only by the C++ code.
  while (num_lines > last_read_line) {
  while (num_lines > last_read_line) {
    str_target = file_lines[last_read_line++];
    str_target = file_lines[last_read_line++];
    if (str_target.size() > 0) {
    if (str_target.size() > 15) {
      if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) {
      if (str_target.substr(0, 15).compare("USE_REFINEMENT=") == 0) {
	regex_search(str_target, m, re);
	regex_search(str_target, m, re);
	short refine_flag = (short)stoi(m.str());
	short refine_flag = (short)stoi(m.str());
	conf->_refine_flag = refine_flag;
	conf->_refine_flag = refine_flag;
      }
      }
      else if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) {
    }
    if (str_target.size() > 14) {
      if (str_target.substr(0, 14).compare("USE_DYN_ORDER=") == 0) {
	regex_search(str_target, m, re);
	regex_search(str_target, m, re);
	short dyn_order_flag = (short)stoi(m.str());
	short dyn_order_flag = (short)stoi(m.str());
	conf->_dyn_order_flag = dyn_order_flag;
	conf->_dyn_order_flag = dyn_order_flag;
+115 −102
Original line number Original line Diff line number Diff line
@@ -47,10 +47,6 @@
#include <omp.h>
#include <omp.h>
#endif
#endif


#ifdef USE_TARGET_OFFLOAD
#pragma omp requires unified_shared_memory
#endif

using namespace std;
using namespace std;


void apc(
void apc(
@@ -407,9 +403,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) {
  return result;
  return result;
}
}


#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
// #pragma omp begin declare target device_type(any)
#endif
// #endif
double cgev(int ipamo, int mu, int l, int m) {
double cgev(int ipamo, int mu, int l, int m) {
  double result = 0.0;
  double result = 0.0;
  double xd = 0.0, xn = 0.0;
  double xd = 0.0, xn = 0.0;
@@ -443,9 +439,9 @@ double cgev(int ipamo, int mu, int l, int m) {
  }
  }
  return result;
  return result;
}
}
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
// #pragma omp end declare target
#endif
// #endif


void cms(dcomplex **am, ParticleDescriptor *c1) {
void cms(dcomplex **am, ParticleDescriptor *c1) {
  dcomplex dm, de, cgh, cgk;
  dcomplex dm, de, cgh, cgk;
@@ -649,9 +645,9 @@ void crsm1(double vk, double exri, ParticleDescriptor *c1) {
  delete[] svs;
  delete[] svs;
}
}


#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
// #pragma omp begin declare target device_type(any)
#endif
// #endif
dcomplex ghit_d(
dcomplex ghit_d(
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      ParticleDescriptor *c1, double *rac3j
	      ParticleDescriptor *c1, double *rac3j
@@ -862,13 +858,13 @@ dcomplex ghit_d(
  }
  }
  return result;
  return result;
}
}
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
// #pragma omp end declare target
#endif
// #endif


#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
// #pragma omp begin declare target device_type(any)
#endif
// #endif
dcomplex ghit(
dcomplex ghit(
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2,
	      ParticleDescriptor *c1
	      ParticleDescriptor *c1
@@ -1079,9 +1075,9 @@ dcomplex ghit(
  }
  }
  return result;
  return result;
}
}
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
// #pragma omp end declare target
#endif
// #endif


void hjv(
void hjv(
	 double exri, double vk, int &jer, int &lcalc, dcomplex &arg,
	 double exri, double vk, int &jer, int &lcalc, dcomplex &arg,
@@ -1339,11 +1335,12 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("pcros intermediate loop 1");
  nvtxRangePush("pcros intermediate loop 1");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
// #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
#else
// #else
// #pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
// #endif
#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
#endif
  for (int i12 = 0; i12 < nlemt; i12++) {
  for (int i12 = 0; i12 < nlemt; i12++) {
      // int i = i12 - 1;
      // int i = i12 - 1;
      dcomplex am = cc0;
      dcomplex am = cc0;
@@ -1408,11 +1405,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
  csam = -(ccs / (exri * vk)) * 0.5 * I;
  csam = -(ccs / (exri * vk)) * 0.5 * I;
  sum2 = cc0;
  sum2 = cc0;
  sum3 = cc0;
  sum3 = cc0;
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
// #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
#else
// #else
// #pragma omp parallel for simd reduction(+:sum2,sum3)
// #endif
#pragma omp parallel for simd reduction(+:sum2,sum3)
#pragma omp parallel for simd reduction(+:sum2,sum3)
#endif
  for (int i14 = 0; i14 < c1->nlem; i14++) { 
  for (int i14 = 0; i14 < c1->nlem; i14++) { 
    int ie = i14 + c1->nlem;
    int ie = i14 + c1->nlem;
    sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
    sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
@@ -1420,11 +1418,12 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
  } // i14 loop
  } // i14 loop
  double sumpi = 0.0;
  double sumpi = 0.0;
  dcomplex sumpd = cc0;
  dcomplex sumpd = cc0;
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
// #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
#else
// #else
// #pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
// #endif
#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
#endif
  for (int i16 = 0; i16 < nlemt; i16++) {
  for (int i16 = 0; i16 < nlemt; i16++) {
    for (int j16 = 0; j16 < c1->nlem; j16++) {
    for (int j16 = 0; j16 < c1->nlem; j16++) {
      int je = j16 + c1->nlem;
      int je = j16 + c1->nlem;
@@ -1628,9 +1627,9 @@ void r3j000(int j2, int j3, double *rac3j) {
  }
  }
}
}


#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
// #pragma omp begin declare target device_type(any)
#endif
// #endif
void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
  int jmx = j3 + j2;
  int jmx = j3 + j2;
  int jdf = j3 - j2;
  int jdf = j3 - j2;
@@ -1748,13 +1747,13 @@ void r3jjr(int j2, int j3, int m2, int m3, double *rac3j) {
    }
    }
  }
  }
}
}
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
// #pragma omp end declare target
#endif
// #endif


#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp begin declare target device_type(any)
// #pragma omp begin declare target device_type(any)
#endif
// #endif
void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
  int jmx = j3 + j2;
  int jmx = j3 + j2;
  int jdf = j3 - j2;
  int jdf = j3 - j2;
@@ -1872,9 +1871,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) {
    }
    }
  }
  }
}
}
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp end declare target
// #pragma omp end declare target
#endif
// #endif


void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) {
void r3jmr(int j1, int j2, int j3, int m1, double *rac3j) {
  int mmx = (j2 < j3 - m1) ? j2 : j3 - m1;
  int mmx = (j2 < j3 - m1) ? j2 : j3 - m1;
@@ -2005,11 +2004,12 @@ void raba(
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("raba inner loop 1");
  nvtxRangePush("raba inner loop 1");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
#else
// #else
// #pragma omp parallel for simd reduction(+:c1, c2)
// #endif
#pragma omp parallel for simd reduction(+:c1, c2)
#pragma omp parallel for simd reduction(+:c1, c2)
#endif
  for (int j10 = 1; j10 <= nlemt; j10++) {
  for (int j10 = 1; j10 <= nlemt; j10++) {
      int j = j10 - 1;
      int j = j10 - 1;
      c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]);
      c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]);
@@ -2027,11 +2027,12 @@ void raba(
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("raba outer loop 2");
  nvtxRangePush("raba outer loop 2");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp teams distribute parallel for
// #pragma omp teams distribute parallel for
#else
// #else
// #pragma omp parallel for
// #endif
#pragma omp parallel for
#pragma omp parallel for
#endif
  for (int ipo = 0; ipo < 2; ipo++) {
  for (int ipo = 0; ipo < 2; ipo++) {
    int jpo = 1 - ipo;
    int jpo = 1 - ipo;
    ctqce[ipo][0] = cc0;
    ctqce[ipo][0] = cc0;
@@ -2063,11 +2064,12 @@ void raba(
#ifdef USE_NVTX
#ifdef USE_NVTX
    nvtxRangePush("raba inner loop 2");
    nvtxRangePush("raba inner loop 2");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
// #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
#else
// #else
// #pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
// #endif
#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
#endif
    for (int k = 1; k<=kmax; k++) {
    for (int k = 1; k<=kmax; k++) {
      int l60 = (int) sqrt(k+1);
      int l60 = (int) sqrt(k+1);
      int im60 = k - (l60*l60) + 1;
      int im60 = k - (l60*l60) + 1;
@@ -2140,11 +2142,12 @@ void raba(
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("raba loop 3");
  nvtxRangePush("raba loop 3");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd
// #pragma omp target teams distribute parallel for simd
#else
// #else
// #pragma omp parallel for simd
// #endif
#pragma omp parallel for simd
#pragma omp parallel for simd
#endif
  for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
  for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
    int ipo = ipo78 - 1;
    int ipo = ipo78 - 1;
    tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;
    tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i;
@@ -2214,11 +2217,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
#ifdef USE_NVTX
#ifdef USE_NVTX
      nvtxRangePush("scr0 inner loop 1");
      nvtxRangePush("scr0 inner loop 1");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
// #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
#else
// #else
// #pragma omp parallel for simd reduction(+:sums, sum21)
// #endif
#pragma omp parallel for simd reduction(+:sums, sum21)
#pragma omp parallel for simd reduction(+:sums, sum21)
#endif
      for (int l10 = 1; l10 <= c1->li; l10++) {
      for (int l10 = 1; l10 <= c1->li; l10++) {
	double fl = 1.0 * (l10 + l10 + 1);
	double fl = 1.0 * (l10 + l10 + 1);
	// dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1];
	// dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1];
@@ -2262,11 +2266,12 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("scr0 loop 2");
  nvtxRangePush("scr0 loop 2");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
// #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
#else
// #else
// #pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
// #endif
#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
#endif
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
    int iogi = c1->iog[i14 - 1];
    int iogi = c1->iog[i14 - 1];
    scs += c1->sscs[iogi - 1];
    scs += c1->sscs[iogi - 1];
@@ -2328,11 +2333,12 @@ void scr2(
#ifdef USE_NVTX
#ifdef USE_NVTX
      nvtxRangePush("scr2 inner loop 1");
      nvtxRangePush("scr2 inner loop 1");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
// #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
#else
// #else
// #pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
// #endif
#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
#endif
      for (int k = 1; k<=kmax; k++) {
      for (int k = 1; k<=kmax; k++) {
	int l10 = (int) sqrt(k+1);
	int l10 = (int) sqrt(k+1);
	int im10 = k - (l10*l10) + 1;
	int im10 = k - (l10*l10) + 1;
@@ -2384,11 +2390,12 @@ void scr2(
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("scr2 loop 2");
  nvtxRangePush("scr2 loop 2");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
// #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
#else
// #else
// #pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
// #endif
#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
#endif
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
  for (int i14 = 1; i14 <= c1->nsph; i14++) {
    int i = i14 - 1;
    int i = i14 - 1;
    int iogi = c1->iog[i14 - 1];
    int iogi = c1->iog[i14 - 1];
@@ -2418,11 +2425,12 @@ void scr2(
#ifdef USE_NVTX
#ifdef USE_NVTX
      nvtxRangePush("scr2 inner loop 3");
      nvtxRangePush("scr2 inner loop 3");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(4)
// #pragma omp target teams distribute parallel for simd collapse(4)
#else
// #else
// #pragma omp parallel for simd collapse(4)
// #endif
#pragma omp parallel for simd collapse(4)
#pragma omp parallel for simd collapse(4)
#endif
      for (int ipo1 = 1; ipo1 <=2; ipo1++) {
      for (int ipo1 = 1; ipo1 <=2; ipo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
	  for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
	  for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
@@ -2445,11 +2453,12 @@ void scr2(
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePush("scr2 loop 4");
  nvtxRangePush("scr2 loop 4");
#endif
#endif
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for collapse(4)
// #pragma omp target teams distribute parallel for collapse(4)
#else
// #else
// #pragma omp parallel for collapse(4)
// #endif
#pragma omp parallel for collapse(4)
#pragma omp parallel for collapse(4)
#endif
  for (int ipo1 = 1; ipo1 <=2; ipo1++) {
  for (int ipo1 = 1; ipo1 <=2; ipo1++) {
    for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
    for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
      for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
      for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
@@ -2582,11 +2591,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
  // but if it results im = 0, then we set l = l-1 and im = 2*l+1
  // but if it results im = 0, then we set l = l-1 and im = 2*l+1
  // furthermore if it results im > 2*l+1, then we set
  // furthermore if it results im > 2*l+1, then we set
  // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
  // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(3)
// #pragma omp target teams distribute parallel for simd collapse(3)
#else
// #else
// #pragma omp parallel for simd collapse(3)
// #endif
#pragma omp parallel for simd collapse(3)
#pragma omp parallel for simd collapse(3)
#endif
  for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
  for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
    for (int k2 = 1; k2<=k2max; k2++) {
    for (int k2 = 1; k2<=k2max; k2++) {
      for (int k3 = 1; k3<=k3max; k3++) {
      for (int k3 = 1; k3<=k3max; k3++) {
@@ -2632,11 +2642,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
#endif
#endif
  dcomplex *am_v = am[0];
  dcomplex *am_v = am[0];
  dcomplex *sam_v = c1->sam[0];
  dcomplex *sam_v = c1->sam[0];
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
// #pragma omp target teams distribute parallel for simd collapse(2)
#else
// #else
// #pragma omp parallel for simd collapse(2)
// #endif
#pragma omp parallel for simd collapse(2)
#pragma omp parallel for simd collapse(2)
#endif
  for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
  for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
    for (int i3 = 1; i3 <= c1->nlem; i3++) {
    for (int i3 = 1; i3 <= c1->nlem; i3++) {
      dcomplex sum1 = cc0;
      dcomplex sum1 = cc0;
@@ -2669,11 +2680,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
      sam_v[vecind1e + i3e - 1] = sum4;
      sam_v[vecind1e + i3e - 1] = sum4;
    } // i3 loop
    } // i3 loop
  } // i1 loop
  } // i1 loop
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
// #pragma omp target teams distribute parallel for simd collapse(2)
#else
// #else
// #pragma omp parallel for simd collapse(2)
// #endif 
#pragma omp parallel for simd collapse(2)
#pragma omp parallel for simd collapse(2)
#endif 
  for (int i1 = 1; i1 <= c1->ndi; i1++) {
  for (int i1 = 1; i1 <= c1->ndi; i1++) {
    for (int i0 = 1; i0 <= c1->nlem; i0++) {
    for (int i0 = 1; i0 <= c1->nlem; i0++) {
      int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
      int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
@@ -2682,11 +2694,12 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
    } // i0 loop
    } // i0 loop
  } // i1 loop
  } // i1 loop
  dcomplex *vec_am0m = c1->am0m[0];
  dcomplex *vec_am0m = c1->am0m[0];
#ifdef USE_TARGET_OFFLOAD
// #ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
// #pragma omp target teams distribute parallel for simd collapse(2)
#else
// #else
// #pragma omp parallel for simd collapse(2)
// #endif
#pragma omp parallel for simd collapse(2)
#pragma omp parallel for simd collapse(2)
#endif
  for (int i0 = 1; i0 <= c1->nlem; i0++) {
  for (int i0 = 1; i0 <= c1->nlem; i0++) {
    for (int i3 = 1; i3 <= c1->nlemt; i3++) {
    for (int i3 = 1; i3 <= c1->nlemt; i3++) {
      int i0e = i0 + c1->nlem;
      int i0e = i0 + c1->nlem;
+54 −23
Original line number Original line Diff line number Diff line
@@ -44,6 +44,10 @@
#include "../include/file_io.h"
#include "../include/file_io.h"
#endif
#endif


#ifdef USE_TARGET_OFFLOAD
#include <cstdlib>
#endif

using namespace std;
using namespace std;


// >>> START OF Swap1 CLASS IMPLEMENTATION <<<
// >>> START OF Swap1 CLASS IMPLEMENTATION <<<
@@ -220,18 +224,32 @@ bool Swap1::operator ==(Swap1 &other) {
// >>> START OF Swap2 CLASS IMPLEMENTATION <<<
// >>> START OF Swap2 CLASS IMPLEMENTATION <<<
Swap2::Swap2(int nkv) {
Swap2::Swap2(int nkv) {
  _nkv = nkv;
  _nkv = nkv;
#ifdef USE_TARGET_OFFLOAD
  vkv = (double *)aligned_alloc(64, _nkv * sizeof(double));
  vec_vkzm = (double *)aligned_alloc(64, _nkv * _nkv * sizeof(double));
#pragma omp parallel for collapse(2)
  for (int i = 0; i < _nkv; i++) {
    for (int j = 0; j < _nkv; j++) {
      vkv[i] = 0.0;
      vec_vkzm[_nkv * i +j] = 0.0;
    }
  }
#else
  vkv = new double[_nkv]();
  vkv = new double[_nkv]();
  vec_vkzm = new double[_nkv * _nkv]();
  vec_vkzm = new double[_nkv * _nkv]();
  vkzm = new double*[nkv];
#endif // USE TARGET_OFFLOAD
  for (int vi = 0; vi < _nkv; vi++) vkzm[vi] = vec_vkzm + vi * _nkv;
  _last_vector = 0;
  _last_vector = 0;
  _last_matrix = 0;
  _last_matrix = 0;
}
}


Swap2::~Swap2() {
Swap2::~Swap2() {
#ifdef USE_TARGET_OFFLOAD
  free(vkv);
  free(vec_vkzm);
#else
  delete[] vkv;
  delete[] vkv;
  delete[] vec_vkzm;
  delete[] vec_vkzm;
  delete[] vkzm;
#endif // USE_TARGET_OFFLOAD
}
}


Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) {
Swap2* Swap2::from_binary(const std::string& file_name, const std::string& mode) {
@@ -298,14 +316,14 @@ Swap2* Swap2::from_legacy(const std::string& file_name) {
  fstream input;
  fstream input;
  Swap2 *instance = NULL;
  Swap2 *instance = NULL;
  int fnkv, fnlmmt, fnrvc;
  int fnkv, fnlmmt, fnrvc;
  double **fvkzm = NULL;
  double *fvkzm = NULL;
  double *fvkv = NULL;
  double *fvkv = NULL;
  double value;
  double value;
  input.open(file_name.c_str(), ios::in | ios::binary);
  input.open(file_name.c_str(), ios::in | ios::binary);
  if (input.is_open()) {
  if (input.is_open()) {
    input.read(reinterpret_cast<char *>(&fnkv), sizeof(int));
    input.read(reinterpret_cast<char *>(&fnkv), sizeof(int));
    instance = new Swap2(fnkv);
    instance = new Swap2(fnkv);
    fvkzm = instance->get_matrix();
    fvkzm = instance->vec_vkzm;
    fvkv = instance->get_vector();
    fvkv = instance->get_vector();
    for (int vj = 0; vj < fnkv; vj++) {
    for (int vj = 0; vj < fnkv; vj++) {
      input.read(reinterpret_cast<char *>(&value), sizeof(double));
      input.read(reinterpret_cast<char *>(&value), sizeof(double));
@@ -314,7 +332,7 @@ Swap2* Swap2::from_legacy(const std::string& file_name) {
    for (int mi = 0; mi < fnkv; mi++) {
    for (int mi = 0; mi < fnkv; mi++) {
      for (int mj = 0; mj < fnkv; mj++) {
      for (int mj = 0; mj < fnkv; mj++) {
	input.read(reinterpret_cast<char *>(&value), sizeof(double));
	input.read(reinterpret_cast<char *>(&value), sizeof(double));
	fvkzm[mi][mj] = value;
	fvkzm[fnkv * mi + mj] = value;
      }
      }
    }
    }
    input.read(reinterpret_cast<char *>(&value), sizeof(double));
    input.read(reinterpret_cast<char *>(&value), sizeof(double));
@@ -359,7 +377,7 @@ long Swap2::get_size(int nkv) {
void Swap2::push_matrix(double value) {
void Swap2::push_matrix(double value) {
  int col = _last_matrix % (_nkv - 1);
  int col = _last_matrix % (_nkv - 1);
  int row = _last_matrix - (_nkv * row);
  int row = _last_matrix - (_nkv * row);
  vkzm[row][col] = value;
  vec_vkzm[nkv * row + col] = value;
  _last_matrix++;
  _last_matrix++;
}
}


@@ -480,7 +498,7 @@ void Swap2::write_legacy(const std::string& file_name) {
    }
    }
    for (int mi = 0; mi < _nkv; mi++) {
    for (int mi = 0; mi < _nkv; mi++) {
      for (int mj = 0; mj < _nkv; mj++) {
      for (int mj = 0; mj < _nkv; mj++) {
	value = vkzm[mi][mj];
	value = vec_vkzm[nkv * mi + mj];
	output.write(reinterpret_cast<const char*>(&value), sizeof(double));
	output.write(reinterpret_cast<const char*>(&value), sizeof(double));
      }
      }
    }
    }
@@ -552,8 +570,9 @@ bool Swap2::operator ==(Swap2 &other) {
    }
    }
  }
  }
  for (int mi = 0; mi < _nkv; mi++) {
  for (int mi = 0; mi < _nkv; mi++) {
    int nkvi = nkv * mi;
    for (int mj = 0; mj < _nkv; mj++) {
    for (int mj = 0; mj < _nkv; mj++) {
      if (vkzm[mi][mj] != other.vkzm[mi][mj]) {
      if (vec_vkzm[nkvi + mj] != other.vec_vkzm[nkvi + mj]) {
	return false;
	return false;
      }
      }
    }
    }
@@ -580,22 +599,33 @@ TFRFME::TFRFME(int lmode, int lm, int nkv, int nxv, int nyv, int nzv) {
  _exril = 0.0;
  _exril = 0.0;


  // Array initialization
  // Array initialization
  xv = new double[nxv]();
  yv = new double[nyv]();
  zv = new double[nzv]();
  _nlmmt = _lm * (_lm + 2) * 2;
  _nlmmt = _lm * (_lm + 2) * 2;
  _nrvc = _nxv * _nyv * _nzv;
  _nrvc = _nxv * _nyv * _nzv;
  vec_wsum = new dcomplex[nrvc * nlmmt]();
#ifdef USE_TARGET_OFFLOAD
  wsum = new dcomplex*[nlmmt];
  xv = (double *)aligned_alloc(64, sizeof(double) * _nxv);
  for (int wi = 0; wi < nlmmt; wi++) wsum[wi] = vec_wsum + wi * nrvc;
  yv = (double *)aligned_alloc(64, sizeof(double) * _nyv);
  zv = (double *)aligned_alloc(64, sizeof(double) * _nzv);
  vec_wsum = (dcomplex *)aligned_alloc(64, sizeof(dcomplex) * _nrvc * _nlmmt);
#else
  xv = new double[_nxv]();
  yv = new double[_nyv]();
  zv = new double[_nzv]();
  vec_wsum = new dcomplex[_nrvc * _nlmmt]();
#endif // USE_TARGET_OFFLOAD
}
}


TFRFME::~TFRFME() {
TFRFME::~TFRFME() {
#ifdef USE_TARGET_OFFLOAD
  free(xv);
  free(yv);
  free(zv);
  free(vec_wsum);
#else
  delete[] xv;
  delete[] xv;
  delete[] yv;
  delete[] yv;
  delete[] zv;
  delete[] zv;
  delete[] vec_wsum;
  delete[] vec_wsum;
  delete[] wsum;
#endif
}
}


TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) {
TFRFME* TFRFME::from_binary(const std::string& file_name, const std::string& mode) {
@@ -662,7 +692,7 @@ TFRFME* TFRFME::from_hdf5(const std::string& file_name) {
    for (int wj = 0; wj < nrvc; wj++) {
    for (int wj = 0; wj < nrvc; wj++) {
      for (int wi = 0; wi < nlmmt; wi++) {
      for (int wi = 0; wi < nlmmt; wi++) {
	value = elements[index] + elements[index + 1] * I;
	value = elements[index] + elements[index + 1] * I;
	instance->wsum[wi][wj] = value;
	instance->vec_wsum[nrvc * wi + wj] = value;
	index += 2;
	index += 2;
      } // wi loop
      } // wi loop
    } // wj loop
    } // wj loop
@@ -727,7 +757,7 @@ TFRFME* TFRFME::from_legacy(const std::string& file_name) {
	input.read(reinterpret_cast<char *>(&rval), sizeof(double));
	input.read(reinterpret_cast<char *>(&rval), sizeof(double));
	input.read(reinterpret_cast<char *>(&ival), sizeof(double));
	input.read(reinterpret_cast<char *>(&ival), sizeof(double));
	dcomplex value = rval + ival * I;
	dcomplex value = rval + ival * I;
	instance->wsum[wi][wj] = value;
	instance->vec_wsum[nrvc * wi + wj] = value;
      } // wi loop
      } // wi loop
    } // wj loop
    } // wj loop
    input.close();
    input.close();
@@ -842,8 +872,8 @@ void TFRFME::write_hdf5(const std::string& file_name) {
  int index = 0;
  int index = 0;
  for (int wj = 0; wj < nrvc; wj++) {
  for (int wj = 0; wj < nrvc; wj++) {
    for (int wi = 0; wi < nlmmt; wi++) {
    for (int wi = 0; wi < nlmmt; wi++) {
      ptr_elements[index++] = real(wsum[wi][wj]);
      ptr_elements[index++] = real(vec_wsum[nrvc * wi + wj]);
      ptr_elements[index++] = imag(wsum[wi][wj]);
      ptr_elements[index++] = imag(vec_wsum[nrvc * wi + wj]);
    } // wi loop
    } // wi loop
  } // wj loop
  } // wj loop
  rec_ptr_list.append(ptr_elements);
  rec_ptr_list.append(ptr_elements);
@@ -891,8 +921,8 @@ void TFRFME::write_legacy(const std::string& file_name) {
      output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double));
      output.write(reinterpret_cast<char *>(&(zv[zi])), sizeof(double));
    for (int wj = 0; wj < _nrvc; wj++) {
    for (int wj = 0; wj < _nrvc; wj++) {
      for (int wi = 0; wi < _nlmmt; wi++) {
      for (int wi = 0; wi < _nlmmt; wi++) {
	double rval = real(wsum[wi][wj]);
	double rval = real(vec_wsum[nrvc * wi + wj]);
	double ival = imag(wsum[wi][wj]);
	double ival = imag(vec_wsum[nrvc * wi + wj]);
	output.write(reinterpret_cast<char *>(&rval), sizeof(double));
	output.write(reinterpret_cast<char *>(&rval), sizeof(double));
	output.write(reinterpret_cast<char *>(&ival), sizeof(double));
	output.write(reinterpret_cast<char *>(&ival), sizeof(double));
      } // wi loop
      } // wi loop
@@ -962,8 +992,9 @@ bool TFRFME::operator ==(const TFRFME& other) {
    }
    }
  }
  }
  for (int wi = 0; wi < _nlmmt; wi++) {
  for (int wi = 0; wi < _nlmmt; wi++) {
    int i = _nrvc * wi;
    for (int wj = 0; wj < _nrvc; wj++) {
    for (int wj = 0; wj < _nrvc; wj++) {
      if (wsum[wi][wj] != other.wsum[wi][wj]) {
      if (vec_wsum[i + wj] != other.vec_wsum[i + wj]) {
	return false;
	return false;
      }
      }
    } // wj loop
    } // wj loop
Loading