Loading build/configure.sh +8 −4 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ NVTXFLAGS="" OMPMODE="auto" OFFLOAD="auto" OFFLOADFLAGS="" OFFLOADLDFLAGS="" # End of default configuration settings # Function declarations Loading Loading @@ -875,7 +876,7 @@ int main(int argc, char** argv) { return 0; } EOF $CXX -fopenmp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none="-O3 -ggdb -fcf-protection=none -fno-stack-protector -fopt-info -lm -latomic -lgomp" conf_test_offload.cpp -o conf_test_offload > /dev/null 2>>error.log $CXX -fopenmp -fno-strict-aliasing -foffload=nvptx-none="-O2 -march=sm_70 -mptx=7.3" conf_test_offload.cpp -o conf_test_offload > /dev/null 2>>error.log result=$? rm conf_test_offload.cpp if [ "x$result" = "x0" ]; then Loading @@ -886,7 +887,8 @@ EOF if [ "x$result" = "x0" ]; then echo "yes" echo "yes" >>configure.log OFFLOADFLAGS=" -DUSE_TARGET_OFFLOAD -fno-lto -fcf-protection=none -fno-stack-protector -foffload=nvptx-none=\"-O${CXX_OPT}${CXX_DBG} -fno-lto -fcf-protection=none -fno-stack-protector -fopt-info -lm -latomic -lgomp\"" OFFLOADFLAGS=" -DUSE_TARGET_OFFLOAD -fno-strict-aliasing -foffload=nvptx-none=\"-O2 -march=sm_70 -mptx=7.3\"" OFFLOADLDFLAGS=" -lgomp" if [ "x${OMPFLAGS}" = "x" ]; then OFFLOADFLAGS="-fopenmp ${OFFLOADFLAGS}" fi Loading @@ -894,9 +896,11 @@ EOF echo "no" echo "no" >>configure.log OFFLOADFLAGS="" OFFLOADLDFLAGS="" fi else OFFLOADFLAGS="" OFFLOADLDFLAGS="" fi # End of offload checks if [ "x$CXXFLAGS" = "x" ]; then Loading @@ -904,9 +908,9 @@ if [ "x$CXXFLAGS" = "x" ]; then fi if [ "x$CXXLDFLAGS" = "x" ]; then if [ "x$LIBMODE" = "xstatic" ]; then CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}" CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}${OFFLOADLDFLAGS}" else CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}" CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}${OFFLOADLDFLAGS}" fi fi Loading src/cluster/cluster.cpp +27 −12 Original line number Diff line number Diff line Loading @@ -44,6 +44,10 @@ #include <cuda_runtime.h> #endif #ifdef USE_MAGMA #include "magma_v2.h" #endif #ifndef INCLUDE_TYPES_H_ #include "../include/types.h" #endif Loading Loading @@ -100,6 +104,14 @@ #include "../include/IterationData.h" #endif #ifndef INCLUDE_COMMONS_H_ #include "../include/Commons.h" #endif #ifndef INCLUDE_MAGMA_CALLS_H_ #include "../include/magma_calls.h" #endif using namespace std; /*! \brief Main calculation loop. Loading Loading @@ -150,6 +162,7 @@ void cluster(const string& config_file, const string& data_file, const string& o const magma_int_t d_array_max_size = 32; // TEMPORARY: can become configurable parameter magma_device_t *device_array = new magma_device_t[d_array_max_size]; magma_int_t num_devices; cudaDeviceSetLimit(cudaLimitStackSize, 4096); magma_getdevices(device_array, d_array_max_size, &num_devices); device_count = (int)num_devices; delete[] device_array; Loading Loading @@ -872,11 +885,13 @@ int cluster_jxi488_cycle( outam0->write_to_disk(outam0_name); delete outam0; #endif // DEBUG_AM if (rs.use_offload) { cms_gpu(cid->am, cid->c1); } else { cms(cid->am, cid->c1); } #ifdef USE_TARGET_OFFLOAD #ifdef USE_MAGMA magmaDoubleComplex* vec_am = (magmaDoubleComplex *)(cid->am[0]); magma_cms(vec_am, cid->c1, cid->proc_device); #endif //USE_MAGMA #endif // USE_TARGET_OFFLOAD cms_gpu(cid->am[0], cid->c1); #ifdef DEBUG_AM VirtualAsciiFile *outam1 = new VirtualAsciiFile(); string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt"; Loading @@ -889,10 +904,10 @@ int cluster_jxi488_cycle( write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1); outam1->write_to_disk(outam1_name); delete outam1; #endif #endif // DEBUG_AM #ifdef USE_NVTX nvtxRangePop(); #endif #endif // USE_NVTX interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; Loading @@ -900,7 +915,7 @@ int cluster_jxi488_cycle( interval_start = chrono::high_resolution_clock::now(); #ifdef USE_NVTX nvtxRangePush("Invert the matrix"); #endif #endif // USE_NVTX invert_matrix(cid->am, ndit, jer, output_path, jxi488, mxndm, cid->proc_device, rs); #ifdef DEBUG_AM VirtualAsciiFile *outam2 = new VirtualAsciiFile(); Loading @@ -914,10 +929,10 @@ int cluster_jxi488_cycle( write_dcomplex_matrix(outam2, cid->am, ndit, ndit); outam2->write_to_disk(outam2_name); delete outam2; #endif #endif // DEBUG_AM #ifdef USE_NVTX nvtxRangePop(); #endif #endif // USE_NVTX interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; Loading @@ -931,7 +946,7 @@ int cluster_jxi488_cycle( interval_start = chrono::high_resolution_clock::now(); #ifdef USE_NVTX nvtxRangePush("Average calculation"); #endif #endif // USE_NVTX ztm(cid->am, cid->c1); #ifdef DEBUG_AM VirtualAsciiFile *outam3 = new VirtualAsciiFile(); Loading @@ -945,7 +960,7 @@ int cluster_jxi488_cycle( write_dcomplex_matrix(outam3, cid->am, ndit, ndit); outam3->write_to_disk(outam3_name); delete outam3; #endif #endif // DEBUG_AM if (idfc >= 0) { if (jxi488 == jwtm) { int nlemt = 2 * cid->c1->nlem; Loading src/include/clu_subs.h +1 −1 Original line number Diff line number Diff line Loading @@ -119,7 +119,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1); * \param am: `complex double **` * \param c1: `ParticleDescriptor *` */ void cms_gpu(dcomplex **am, ParticleDescriptor *c1); void cms_gpu(dcomplex *am, ParticleDescriptor *c1); #endif // USE_TARGET_OFFLOAD /** Loading src/include/magma_calls.h +2 −2 Original line number Diff line number Diff line Loading @@ -21,11 +21,11 @@ * */ #include <string> #ifndef INCLUDE_MAGMA_CALLS_H_ #define INCLUDE_MAGMA_CALLS_H_ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int device_id); /** * \brief Invert a complex matrix with double precision elements. * Loading src/libnptm/algebraic.cpp +4 −0 Original line number Diff line number Diff line Loading @@ -41,6 +41,10 @@ using namespace std; #endif #ifdef USE_MAGMA #ifndef INCLUDE_COMMONS_H_ #include "../include/Commons.h" #endif #ifndef INCLUDE_MAGMA_CALLS_H_ #include "../include/magma_calls.h" #endif Loading Loading
build/configure.sh +8 −4 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ NVTXFLAGS="" OMPMODE="auto" OFFLOAD="auto" OFFLOADFLAGS="" OFFLOADLDFLAGS="" # End of default configuration settings # Function declarations Loading Loading @@ -875,7 +876,7 @@ int main(int argc, char** argv) { return 0; } EOF $CXX -fopenmp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none="-O3 -ggdb -fcf-protection=none -fno-stack-protector -fopt-info -lm -latomic -lgomp" conf_test_offload.cpp -o conf_test_offload > /dev/null 2>>error.log $CXX -fopenmp -fno-strict-aliasing -foffload=nvptx-none="-O2 -march=sm_70 -mptx=7.3" conf_test_offload.cpp -o conf_test_offload > /dev/null 2>>error.log result=$? rm conf_test_offload.cpp if [ "x$result" = "x0" ]; then Loading @@ -886,7 +887,8 @@ EOF if [ "x$result" = "x0" ]; then echo "yes" echo "yes" >>configure.log OFFLOADFLAGS=" -DUSE_TARGET_OFFLOAD -fno-lto -fcf-protection=none -fno-stack-protector -foffload=nvptx-none=\"-O${CXX_OPT}${CXX_DBG} -fno-lto -fcf-protection=none -fno-stack-protector -fopt-info -lm -latomic -lgomp\"" OFFLOADFLAGS=" -DUSE_TARGET_OFFLOAD -fno-strict-aliasing -foffload=nvptx-none=\"-O2 -march=sm_70 -mptx=7.3\"" OFFLOADLDFLAGS=" -lgomp" if [ "x${OMPFLAGS}" = "x" ]; then OFFLOADFLAGS="-fopenmp ${OFFLOADFLAGS}" fi Loading @@ -894,9 +896,11 @@ EOF echo "no" echo "no" >>configure.log OFFLOADFLAGS="" OFFLOADLDFLAGS="" fi else OFFLOADFLAGS="" OFFLOADLDFLAGS="" fi # End of offload checks if [ "x$CXXFLAGS" = "x" ]; then Loading @@ -904,9 +908,9 @@ if [ "x$CXXFLAGS" = "x" ]; then fi if [ "x$CXXLDFLAGS" = "x" ]; then if [ "x$LIBMODE" = "xstatic" ]; then CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}" CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}${OFFLOADLDFLAGS}" else CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}" CXXLDFLAGS="-Llibnptm -lnptm ${HDF5LDFLAGS} ${LDFLAGS} ${LAPACKLDFLAGS}${CUBLASLDFLAGS}${MAGMALDFLAGS}${OFFLOADLDFLAGS}" fi fi Loading
src/cluster/cluster.cpp +27 −12 Original line number Diff line number Diff line Loading @@ -44,6 +44,10 @@ #include <cuda_runtime.h> #endif #ifdef USE_MAGMA #include "magma_v2.h" #endif #ifndef INCLUDE_TYPES_H_ #include "../include/types.h" #endif Loading Loading @@ -100,6 +104,14 @@ #include "../include/IterationData.h" #endif #ifndef INCLUDE_COMMONS_H_ #include "../include/Commons.h" #endif #ifndef INCLUDE_MAGMA_CALLS_H_ #include "../include/magma_calls.h" #endif using namespace std; /*! \brief Main calculation loop. Loading Loading @@ -150,6 +162,7 @@ void cluster(const string& config_file, const string& data_file, const string& o const magma_int_t d_array_max_size = 32; // TEMPORARY: can become configurable parameter magma_device_t *device_array = new magma_device_t[d_array_max_size]; magma_int_t num_devices; cudaDeviceSetLimit(cudaLimitStackSize, 4096); magma_getdevices(device_array, d_array_max_size, &num_devices); device_count = (int)num_devices; delete[] device_array; Loading Loading @@ -872,11 +885,13 @@ int cluster_jxi488_cycle( outam0->write_to_disk(outam0_name); delete outam0; #endif // DEBUG_AM if (rs.use_offload) { cms_gpu(cid->am, cid->c1); } else { cms(cid->am, cid->c1); } #ifdef USE_TARGET_OFFLOAD #ifdef USE_MAGMA magmaDoubleComplex* vec_am = (magmaDoubleComplex *)(cid->am[0]); magma_cms(vec_am, cid->c1, cid->proc_device); #endif //USE_MAGMA #endif // USE_TARGET_OFFLOAD cms_gpu(cid->am[0], cid->c1); #ifdef DEBUG_AM VirtualAsciiFile *outam1 = new VirtualAsciiFile(); string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt"; Loading @@ -889,10 +904,10 @@ int cluster_jxi488_cycle( write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1); outam1->write_to_disk(outam1_name); delete outam1; #endif #endif // DEBUG_AM #ifdef USE_NVTX nvtxRangePop(); #endif #endif // USE_NVTX interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; Loading @@ -900,7 +915,7 @@ int cluster_jxi488_cycle( interval_start = chrono::high_resolution_clock::now(); #ifdef USE_NVTX nvtxRangePush("Invert the matrix"); #endif #endif // USE_NVTX invert_matrix(cid->am, ndit, jer, output_path, jxi488, mxndm, cid->proc_device, rs); #ifdef DEBUG_AM VirtualAsciiFile *outam2 = new VirtualAsciiFile(); Loading @@ -914,10 +929,10 @@ int cluster_jxi488_cycle( write_dcomplex_matrix(outam2, cid->am, ndit, ndit); outam2->write_to_disk(outam2_name); delete outam2; #endif #endif // DEBUG_AM #ifdef USE_NVTX nvtxRangePop(); #endif #endif // USE_NVTX interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; Loading @@ -931,7 +946,7 @@ int cluster_jxi488_cycle( interval_start = chrono::high_resolution_clock::now(); #ifdef USE_NVTX nvtxRangePush("Average calculation"); #endif #endif // USE_NVTX ztm(cid->am, cid->c1); #ifdef DEBUG_AM VirtualAsciiFile *outam3 = new VirtualAsciiFile(); Loading @@ -945,7 +960,7 @@ int cluster_jxi488_cycle( write_dcomplex_matrix(outam3, cid->am, ndit, ndit); outam3->write_to_disk(outam3_name); delete outam3; #endif #endif // DEBUG_AM if (idfc >= 0) { if (jxi488 == jwtm) { int nlemt = 2 * cid->c1->nlem; Loading
src/include/clu_subs.h +1 −1 Original line number Diff line number Diff line Loading @@ -119,7 +119,7 @@ void cms(dcomplex **am, ParticleDescriptor *c1); * \param am: `complex double **` * \param c1: `ParticleDescriptor *` */ void cms_gpu(dcomplex **am, ParticleDescriptor *c1); void cms_gpu(dcomplex *am, ParticleDescriptor *c1); #endif // USE_TARGET_OFFLOAD /** Loading
src/include/magma_calls.h +2 −2 Original line number Diff line number Diff line Loading @@ -21,11 +21,11 @@ * */ #include <string> #ifndef INCLUDE_MAGMA_CALLS_H_ #define INCLUDE_MAGMA_CALLS_H_ magma_int_t magma_cms(magmaDoubleComplex *vec_am, ParticleDescriptor *c1, int device_id); /** * \brief Invert a complex matrix with double precision elements. * Loading
src/libnptm/algebraic.cpp +4 −0 Original line number Diff line number Diff line Loading @@ -41,6 +41,10 @@ using namespace std; #endif #ifdef USE_MAGMA #ifndef INCLUDE_COMMONS_H_ #include "../include/Commons.h" #endif #ifndef INCLUDE_MAGMA_CALLS_H_ #include "../include/magma_calls.h" #endif Loading